Esempio n. 1
0
def allAgesGeneralSUD(logger):
    '''Counts users with any SUD and with at least 2 SUDs, per race

    For every race in table2_config["inputs"]["races"], the number of rows
    of tejas.sud_race_age with that race is appended to "any_sud". The rows
    of each race whose SUD category columns sum to 2 or more are then
    tallied and appended to "morethan2_sud".

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        dict -- {"any_sud": [...], "morethan2_sud": [...]} holding raw
            counts (not percentages). "any_sud" follows the order of the
            configured races, "morethan2_sud" is always ordered
            AA, NHPI, MR. If anything fails, the error is logged and
            whatever was collected so far is returned.
    '''
    countDict = {"any_sud": [], "morethan2_sud": []}

    try:
        races = table2_config["inputs"]["races"]

        # Find number of users in each race who have any SUD
        for race in races:
            query = SQL('''
            WITH subQ AS (
            SELECT *
            FROM
                tejas.sud_race_age
            WHERE
                sud_race_age.race = {}
            )
            SELECT count(*) FROM subQ
            ''').format(Literal(race))
            data = [d[0] for d in pgIO.getAllData(query)]
            countDict["any_sud"].append(data[0])

        # Find number of users in each race who have >= 2 SUD
        count = {"AA": 0, "NHPI": 0, "MR": 0}

        for race in races:
            query = SQL('''
            SELECT alc, cannabis, amphe, halluc, nicotin, cocaine,
            opioids, sedate, others, polysub, inhalant
            FROM tejas.sud_race_age
            WHERE sud_race_age.race = {}
            ''').format(Literal(race))
            for row in pgIO.getAllData(query):
                # each column is a flag, so the sum is the number of
                # SUD categories recorded for this user
                if sum(row) >= 2:
                    count[race] += 1

        countDict["morethan2_sud"].extend(count.values())
    except Exception as e:
        logger.error('Cannot find general SUD counts because of {}'.format(e))
    return countDict
Esempio n. 2
0
def countRaceSetting(logger):
    '''Tallies the users of each main race per treatment setting

    For every race in table1_config["inputs"]["races"] one count query is
    run per setting listed in table1_config["params"]["settings"]["all"].

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information

    Returns
    -------
    dict
        race -> list of counts, one entry per setting in config order.
        Races that were not processed keep an empty list.
    '''

    try:
        rd = {"AA": [], "NHPI": [], "MR": []}
        for race in table1_config["inputs"]["races"]:
            perSetting = [0] * len(table1_config["params"]["settings"]["all"])
            for slot, setting in enumerate(table1_config["params"]["settings"]["all"]):
                template = SQL('''
                SELECT count(*)
                FROM tejas.race_age_t1new t1
                INNER JOIN tejas.restofusers t2
                ON t1.siteid = t2.siteid
                AND t1.backgroundid = t2.backgroundid
                WHERE t1.visit_type = {} AND t1.race = {}
                ''')
                query = template.format(Literal(setting), Literal(race))
                # only the first column of the first row carries the count
                firstColumn = [record[0] for record in pgIO.getAllData(query)]
                perSetting[slot] += firstColumn[0]
            rd[race] = perSetting
    except Exception as e:
        logger.error('countRaceSetting failed because of {}'.format(e))
    return rd
Esempio n. 3
0
def genSUDUserKeys(logger):
    '''Writes the patientid of every SUD user to a .csv file

    Selects patientid from sarah.test3 where sud is true and writes one
    row per result to ../data/raw_data/SUDUser_keys.csv, replacing any
    previous content. Failures are logged, not raised.

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information
    '''
    try:
        query = '''
        SELECT
            patientid
        FROM
            sarah.test3
        WHERE
            sud = true
        '''

        data = pgIO.getAllData(query)

        csvfile = "../data/raw_data/SUDUser_keys.csv"

        # newline='' lets the csv module control line endings itself
        # (otherwise blank lines appear between rows on Windows); the
        # with-block closes the file, so no explicit close() is needed
        with open(csvfile, 'w', newline='') as output:
            csv.writer(output).writerows(data)

    except Exception as e:
        logger.error('Failed to generate list of SUD users because of {}'.format(e))

    return
Esempio n. 4
0
def genAllKeys(logger):
    '''Writes the patientid of every row in sarah.test2 to a .csv file

    The output, ../data/raw_data/firstfilter_allkeys.csv, lists the users
    that remain after the first filter (age, race, sex and setting) and is
    meant to feed the second filter by dsmno. Any previous content of the
    file is replaced. Failures are logged, not raised.

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information
    '''
    try:
        query = '''
        SELECT
            patientid
        FROM
            sarah.test2
        '''

        data = pgIO.getAllData(query)

        csvfile = "../data/raw_data/firstfilter_allkeys.csv"

        # newline='' lets the csv module control line endings itself;
        # the with-block already closes the file on exit
        with open(csvfile, 'w', newline='') as output:
            csv.writer(output).writerows(data)

    except Exception as e:
        logger.error(
            'Failed to generate list of patients because of {}'.format(e))

    return
Esempio n. 5
0
def getRace(logger):
    '''Generates the data for raceCount.csv

    This function was used to generate the data for the raceCount.csv file, which
    gets the race and count(race) for ALL the races in raw_data.background.
    After manual selection and grouping, the races under each race in the paper
    (AA, NHPI, MR) were manually entered into the json config file.
    Function was deleted from the main after use.

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information

    Returns
    -------
    list
        The (race, count) rows as handed back by pgIO.getAllData, or an
        empty list when the query raised (the error is logged).
    '''
    # Bound up front so a failed query returns an empty result instead of
    # raising UnboundLocalError right after the error has been logged
    data = []

    try:
        query = '''
        SELECT
        race,
        COUNT(race)
        FROM raw_data.background
        GROUP BY race
        '''

        data = pgIO.getAllData(query)

    except Exception as e:
        logger.error('getRace failed because of {}'.format(e))

    return data
Esempio n. 6
0
def countMainRace(logger):
    '''Counts the users belonging to each main race (AA, NHPI, MR)

    One count query over sarah.test2 joined with sarah.test3 is issued for
    every race listed in table1_config["inputs"]["races"].

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information

    Returns
    -------
    list
        One count per configured race, in config order. On failure the
        error is logged and the counts gathered so far are returned.
    '''

    try:
        total = []
        for race in table1_config["inputs"]["races"]:
            template = SQL('''
            SELECT
                COUNT(*)
            FROM 
                sarah.test2 t1
            INNER JOIN 
                sarah.test3 t2
            ON
                t1.patientid = t2.patientid
            WHERE
                race = {}
            ''')
            query = template.format(Literal(race))
            # the count sits in the first column of the first row
            firstColumn = [record[0] for record in pgIO.getAllData(query)]
            total.append(firstColumn[0])

    except Exception as e:
        logger.error('countMainRace failed because of {}'.format(e))

    return total
Esempio n. 7
0
def createDF_byRace_morethan2SUD(logger, race):
    '''Creates dataframe for a sample from a specified race,
    dependent variable = at least 2 sud

    This function creates a dataframe for a sample from a specified race,
    where the dependent variable is >=2 sud and the independent variables
    are: age, sex and setting. Ages are binned into 12-17, 18-34, 35-49
    and 50+, and every categorical variable is dummy-coded with one level
    left out as the reference.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information
        race {str} -- 'AA', 'NHPI', or 'MR'

    Returns:
        pandas.DataFrame -- columns 'sud', the age dummies up to '35-49',
            the sex dummies from 'M' onwards, the setting dummies up to
            'Inpatient' and a constant 'intercept'. None if the query
            fails before a dataframe could be built (the error is logged).
    '''
    # Bound up front so a failure is reported through the log instead of
    # an UnboundLocalError at the return statement
    df = None

    try:

        query = SQL('''
        SELECT morethan2sud,age,sex,visit_type
        FROM tejas.sud_race_age
        WHERE age BETWEEN 12 AND 100
        AND race = {}
        ''').format(
            Literal(race)
        )

        data = pgIO.getAllData(query)
        main = pd.DataFrame(data={
            'sud': [d[0] for d in data],
            'age': [d[1] for d in data],
            'sex': [d[2] for d in data],
            'setting': [d[3] for d in data],
        })
        df = main.copy()

        # Change sud column to binary, dummify the other columns
        df.replace({False: 0, True: 1}, inplace=True)

        # The query admits ages 12..100 inclusive, so the last bin has to
        # include 100 as well; otherwise it would stay an unbinned value
        ageBins = (
            (range(12, 18), "12-17"),
            (range(18, 35), "18-34"),
            (range(35, 50), "35-49"),
            (range(50, 101), "50+"),
        )
        for ages, label in ageBins:
            main.replace(to_replace=list(ages), value=label, inplace=True)

        # .loc performs the same label-based column slicing the removed
        # .ix indexer did on these string column labels
        dummy_ages = pd.get_dummies(main['age'])
        df = df[['sud']].join(dummy_ages.loc[:, :'35-49'])

        dummy_sexes = pd.get_dummies(main['sex'])
        df = df[['sud', '12-17', '18-34', '35-49']].join(dummy_sexes.loc[:, 'M':])

        dummy_setting = pd.get_dummies(main['setting'])
        df = df[['sud', '12-17', '18-34', '35-49', 'M']].join(dummy_setting.loc[:, :'Inpatient'])

        df['intercept'] = 1.0

    except Exception as e:
        logger.error('createDF_byRace_morethan2SUD failed because of {}'.format(e))

    return df
Esempio n. 8
0
def allAgesCategorisedSUD(logger):
    '''
    Finds, for every SUD category, the share of each race sample
    (all ages together) that has a SUD of that substance

    Counts are collected per race and category from sarah.test2 joined
    with sarah.test4 and then converted by divByAllAges.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        dict -- SUD category -> result of divByAllAges on the list of
            per-race counts (races in config order). Empty or partial
            if a step fails; the error is logged.
    '''
    # Bound up front so a failed query does not turn into an
    # UnboundLocalError at the return statement
    resultsDict = {}

    try:
        countDict = {
            "alc": [],
            "cannabis": [],
            "amphe": [],
            "halluc": [],
            "nicotin": [],
            "cocaine": [],
            "opioids": [],
            "sedate": [],
            "others": [],
            "polysub": [],
            "inhalant": []
        }

        for race in table2_config["inputs"]["races"]:
            for sudcat in table2_config["params"]["sudcats"]:
                query = SQL('''
                SELECT
                    count(*)
                FROM
                    sarah.test2 t1
                INNER JOIN
                    sarah.test4 t2
                ON
                    t1.patientid = t2.patientid
                WHERE
                    t1.race = {}
                AND
                    t2.{} = true
                ''').format(
                    Literal(race),
                    Identifier(sudcat)
                )
                data = [d[0] for d in pgIO.getAllData(query)]
                countDict[sudcat].append(data[0])

        # Change counts to percentage of the race sample
        for sudcat, counts in countDict.items():
            resultsDict[sudcat] = divByAllAges(counts)

    except Exception as e:
        logger.error('Failed to find categorised SUD counts because of {}'.format(e))

    return resultsDict
Esempio n. 9
0
def make_dataset(logger, csvfile, query):
    '''Dumps the result of a query to a .csv file and loads it as a dataframe

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information (not used here)
    csvfile : {str}
        Path of the .csv file to (over)write
    query : {str or composed SQL}
        Query handed to pgIO.getAllData; its rows are expected to line up
        with the header sud, race, age, sex, setting

    Returns
    -------
    pandas.DataFrame
        The content of the freshly written .csv file
    '''
    data = pgIO.getAllData(query)  # returns list of tuples (T/F,.......)

    # newline='' lets the csv module control line endings itself; the
    # with-block closes the file before it is read back below
    with open(csvfile, 'w', newline='') as f:
        csv_out = csv.writer(f)
        csv_out.writerow(['sud', 'race', 'age', 'sex', 'setting'])
        csv_out.writerows(data)

    return pd.read_csv(csvfile)
Esempio n. 10
0
def ageBinnedCategorisedSUD(logger):
    '''
    Finds percentage of the age-binned sample that has
    SUD of a particular substance

    For every SUD category, race and age bin (1-11, 12-17, 18-34, 35-49,
    50-100) the matching users are counted in sarah.test2 joined with
    sarah.test4; the counts are then converted by divByAgeBins.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        dict -- SUD category -> result of divByAgeBins on a list holding,
            per race (config order), the list of counts per age bin.
            Empty or partial if a step fails; the error is logged.
    '''
    # Bound up front so a failed query does not turn into an
    # UnboundLocalError at the return statement
    resultsDict = {}

    # Inclusive (lower, upper) age bounds, kept as strings exactly as they
    # are handed to the query
    ageBins = (('1', '11'), ('12', '17'), ('18', '34'), ('35', '49'), ('50', '100'))

    try:
        countDict = {}

        for sudcat in table2_config["params"]["sudcats"]:
            perRace = []
            for race in table2_config["inputs"]["races"]:
                perBin = []
                for lower, upper in ageBins:
                    query = SQL('''
                    SELECT
                        count(*)
                    FROM
                        sarah.test2 t1
                    INNER JOIN
                        sarah.test4 t2
                    ON
                        t1.patientid = t2.patientid
                    WHERE
                        t1.race = {}
                    AND
                        t1.age BETWEEN {} AND {}
                    AND
                        t2.{} = true
                    ''').format(
                        Literal(race),
                        Literal(lower),
                        Literal(upper),
                        Identifier(sudcat)
                    )
                    data = [d[0] for d in pgIO.getAllData(query)]
                    perBin.append(data[0])
                perRace.append(perBin)
            countDict[sudcat] = perRace

        # Change counts to percentage of the race sample
        for sudcat, counts in countDict.items():
            resultsDict[sudcat] = divByAgeBins(counts)

    except Exception as e:
        logger.error('Failed to find categorised SUD counts because of {}'.format(e))

    return resultsDict
Esempio n. 11
0
def addmorethan2sudcolumn(logger):
    '''Populates the 'morethan2sud' column in tejas.sud_race_age

    This function sums the SUD category flags of each user in
    tejas.sud_race_age. Users with at least 2 'True' values are written to
    ../data/raw_data/atleast2suduser_keys.csv and get their 'morethan2sud'
    column set to true; every row still null afterwards is set to false.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information
    '''
    try:
        query = '''
        SELECT
            siteid, backgroundid, alc, cannabis, amphe, halluc, nicotin,
            cocaine, opioids, sedate, others, polysub, inhalant
        FROM tejas.sud_race_age
        '''
        data = pgIO.getAllData(query)

        csvfile = '../data/raw_data/atleast2suduser_keys.csv'

        # The writer must be closed (and therefore flushed) before the file
        # is read back, otherwise buffered rows never reach the reader
        with open(csvfile, 'w', newline='') as output:
            csv_output = csv.writer(output)
            for row in data:
                # columns 2.. are the SUD category flags
                if sum(row[2:]) >= 2:
                    csv_output.writerow(row)

        with open(csvfile, newline='') as keysFile:
            for user in tqdm(csv.reader(keysFile, delimiter=",")):
                updateQuery = SQL('''
                UPDATE tejas.sud_race_age
                SET morethan2sud = true
                WHERE siteid = {}
                AND backgroundid = {}
                ''').format(Literal(user[0]), Literal(str(user[1])))
                pgIO.commitData(updateQuery)

        #Update column's null values to false
        updateQuery2 = '''
        UPDATE tejas.sud_race_age
        SET morethan2sud = false
        WHERE morethan2sud is null
        '''
        print(pgIO.commitData(updateQuery2))

    except Exception as e:
        logger.error('adding morethan2sud column to the database failed because of {}'.format(e))

    return
Esempio n. 12
0
def checkTableExistence(logger, schemaName, tableName):
    '''Reports whether a table is listed in information_schema.tables

    Arguments:
        logger {logging.Logger} -- logs error information (not used here)
        schemaName {str} -- schema to look in
        tableName {str} -- table to look for

    Returns:
        the first column of the first row of the EXISTS query
    '''
    # NOTE(review): both names are interpolated straight into the SQL
    # text, so they must come from trusted configuration
    existsQuery = '''
                                SELECT EXISTS 
                                (
                                SELECT 1
                                FROM   information_schema.tables 
                                WHERE  table_schema = '{}'
                                AND    table_name = '{}'
                                );
                                '''.format(schemaName, tableName)

    resultRows = pgIO.getAllData(existsQuery, dbName=dbName)
    firstRow = resultRows[0]
    return firstRow[0]
Esempio n. 13
0
def createDF_allRaces_anySUD(logger):
    '''Creates dataframe for total sample, dependent variable = any sud

    This function creates a dataframe for the total sample, where the
    dependent variable is any sud and the independent variables are:
    race, age, sex and setting. Ages 12-99 are binned into 12-17, 18-34,
    35-49 and 50+, and every categorical variable is dummy-coded with one
    level left out as the reference.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        pandas.DataFrame -- columns 'sud', 'MR', 'NHPI', '12-17', '18-34',
            '35-49', 'M', the setting dummies up to 'Inpatient' and a
            constant 'intercept'. None if the query fails before a
            dataframe could be built (the error is logged).
    '''
    # Bound up front so a failure is reported through the log instead of
    # an UnboundLocalError at the return statement
    df = None

    try:

        query = '''
        SELECT * from tejas.restofusers_t3_p1
        '''
        data = pgIO.getAllData(query)
        # assumes the table columns are ordered sud, race, age, sex,
        # setting -- TODO confirm against the table definition
        main = pd.DataFrame(data={
            'sud': [d[0] for d in data],
            'race': [d[1] for d in data],
            'age': [d[2] for d in data],
            'sex': [d[3] for d in data],
            'setting': [d[4] for d in data],
        })
        df = main.copy()

        # Change sud column to binary, dummify the other columns
        df.replace({False: 0, True: 1}, inplace=True)

        # .loc performs the same label-based column slicing the removed
        # .ix indexer did on these string column labels
        dummy_races = pd.get_dummies(main['race'])
        df = df[['sud']].join(dummy_races.loc[:, 'MR':])

        ageBins = (
            (range(12, 18), "12-17"),
            (range(18, 35), "18-34"),
            (range(35, 50), "35-49"),
            (range(50, 100), "50+"),
        )
        for ages, label in ageBins:
            main.replace(to_replace=list(ages), value=label, inplace=True)

        dummy_ages = pd.get_dummies(main['age'])
        df = df[['sud', 'MR', 'NHPI']].join(dummy_ages.loc[:, :'50+'])

        dummy_sexes = pd.get_dummies(main['sex'])
        df = df[['sud', 'MR', 'NHPI', '12-17', '18-34', '35-49', '50+']].join(dummy_sexes.loc[:, 'M':])

        # '50+' is dropped here so it acts as the reference age group
        dummy_setting = pd.get_dummies(main['setting'])
        df = df[['sud', 'MR', 'NHPI', '12-17', '18-34', '35-49', 'M']].join(dummy_setting.loc[:, :'Inpatient'])

        df['intercept'] = 1.0

    except Exception as e:
        logger.error('createDF_allRaces_anySUD failed because of {}'.format(e))
    return df
Esempio n. 14
0
def countRaceSUDppl(logger):
    '''Counts the rows of tejas.sud_race_age for each configured race

    Arguments:
        logger {logging.Logger} -- logs error information (not used here)

    Returns:
        dict -- race -> number of rows with that race. 'AA', 'NHPI' and
            'MR' are always present and default to 0.
    '''
    result = {"AA": 0, "NHPI": 0, "MR": 0}
    for race in table2_config["inputs"]["races"]:
        query = SQL('''
        WITH subQ AS (
        SELECT *
        FROM tejas.sud_race_age
        WHERE sud_race_age.race = {}
        )
        SELECT count(*) FROM subQ
        ''').format(Literal(race))
        rows = pgIO.getAllData(query)
        # Keep the default when nothing came back (None or no rows);
        # indexing an empty result would raise IndexError
        if rows:
            result[race] = rows[0][0]
    return result
Esempio n. 15
0
 def getData(logger,
             self,
             query,
             columns=None,
             saveData=True,
             savePath='../data/intermediate',
             saveName='temp'):
     '''Runs a query and returns the result as a dataframe, optionally pickled

     Arguments:
         logger {logging.Logger} -- logs error information (not used here)
         self -- object providing the dbName attribute to query against
         query -- query handed to pgIO.getAllData

     Keyword Arguments:
         columns {list} -- column names for the dataframe (default: {None},
             which keeps the default labels)
         saveData {bool} -- whether to pickle the dataframe (default: {True})
         savePath {str} -- folder for the pickle, created when missing
             (default: {'../data/intermediate'})
         saveName {str} -- file name without the .pkl extension
             (default: {'temp'})

     Returns:
         pandas.DataFrame -- the query result
     '''
     data = pgIO.getAllData(query, dbName=self.dbName)
     df = pd.DataFrame(data)
     if columns is not None:
         df.columns = columns
     if saveData:
         os.makedirs(savePath, exist_ok=True)
         # pickle the dataframe that was just built
         df.to_pickle(os.path.join(savePath, saveName + '.pkl'))
         print(saveName + '.pkl saved.')
     return df
Esempio n. 16
0
def allAgesCategorisedSUD(logger):
    '''
    Counts, for every SUD category, the users of each race (all ages
    together) that have a SUD of that substance

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        dict -- SUD category -> list of raw counts, one per race in
            table2_config["inputs"]["races"] order. The counts are not
            converted to percentages. Partial if a query fails; the error
            is logged.
    '''
    try:
        countDict = {
            "alc": [],
            "cannabis": [],
            "amphe": [],
            "halluc": [],
            "nicotin": [],
            "cocaine": [],
            "opioids": [],
            "sedate": [],
            "others": [],
            "polysub": [],
            "inhalant": []
        }

        for sudcat in table2_config["params"]["sudcats"]:
            for race in table2_config["inputs"]["races"]:
                query = SQL('''
                WITH subQ AS (
                SELECT * FROM tejas.sud_race_age
                WHERE sud_race_age.race = {}
                AND sud_race_age.{} = true
                ) SELECT count(*) from subQ
                ''').format(Literal(race), Identifier(sudcat))
                data = [d[0] for d in pgIO.getAllData(query)]
                countDict[sudcat].append(data[0])

    except Exception as e:
        logger.error(
            'Failed to find categorised SUD counts because of {}'.format(e))

    return countDict
Esempio n. 17
0
def ageBinnedCategorisedSUD(logger, race):
    '''
    Finds percentage of the age-binned sample that has
    SUD of a particular substance, for one race

    For every SUD category the users of the given race are counted per age
    bin (1-11, 12-17, 18-34, 35-49, 50-100) and each count is passed to
    genPC together with the all-ages count of that category and race.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information
        race {str} -- 'AA', 'NHPI', or 'MR'

    Returns:
        dict -- SUD category -> list of five genPC results, one per age
            bin. Empty for an unknown race, partial if a query fails;
            both cases are logged.
    '''
    countDict = {}

    # Position of each race inside the per-category lists returned by
    # allAgesCategorisedSUD
    x = {"AA": 0, "NHPI": 1, "MR": 2}.get(race)
    if x is None:
        # Reject early instead of failing on an unbound index after the
        # queries have already been run
        logger.error(
            'Failed to find categorised SUD counts because of {}'.format(
                'unknown race {!r}'.format(race)))
        return countDict

    # Inclusive (lower, upper) age bounds, kept as strings exactly as they
    # are handed to the query
    ageBins = (('1', '11'), ('12', '17'), ('18', '34'), ('35', '49'), ('50', '100'))

    allAgesCatSUD = allAgesCategorisedSUD()
    try:
        for sudcat in table2_config["params"]["sudcats"].keys():
            binCounts = []
            for lower, upper in ageBins:
                query = SQL('''
                WITH subQ AS (
                SELECT * FROM tejas.sud_race_age
                WHERE sud_race_age.race = {}
                AND sud_race_age.age >= {}
                AND sud_race_age.age <= {}
                AND sud_race_age.{} = true )
                SELECT count(*) from subQ
                ''').format(Literal(race), Literal(lower), Literal(upper),
                            Identifier(sudcat))
                data = [d[0] for d in pgIO.getAllData(query)]
                binCounts.append(data[0])
            raceTotal = allAgesCatSUD[sudcat][x]
            countDict[sudcat] = [genPC(binCount, raceTotal) for binCount in binCounts]
    except Exception as e:
        logger.error(
            'Failed to find categorised SUD counts because of {}'.format(e))
    return countDict
Esempio n. 18
0
def countRaceAge(logger):
    '''Tallies the users of each main race per age bin

    For every race in table1_config["inputs"]["races"] one count query is
    run per age bin (1-11, 12-17, 18-34, 35-49, 50-100).

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information

    Returns
    -------
    dict
        race -> list of five counts, one per age bin. Races that were
        not processed keep an empty list.
    '''

    try:
        rd = {"AA": [], "NHPI": [], "MR": []}
        for race in table1_config["inputs"]["races"]:
            perBin = [0, 0, 0, 0, 0]
            lowerBounds = ['1', '12', '18', '35', '50']
            upperBounds = ['11', '17', '34', '49', '100']
            for slot, (lower, upper) in enumerate(zip(lowerBounds, upperBounds)):
                template = SQL('''
                WITH subQ as (
                SELECT *
                FROM tejas.race_age_t1new t1
                INNER JOIN tejas.restofusers t2
                ON t1.siteid = t2.siteid
                AND t1.backgroundid = t2.backgroundid
                WHERE (cast (t1.age as int) >= {})
                AND (cast (t1.age as int) <= {}) and t1.race = {}
                )
                SELECT count(*)
                FROM subQ
                ''')
                query = template.format(Literal(lower), Literal(upper), Literal(race))
                # rows come back as tuples; the count is the first element
                firstColumn = [record[0] for record in pgIO.getAllData(query)]
                perBin[slot] += firstColumn[0]
            rd[race] = perBin
    except Exception as e:
        logger.error('countRaceAge failed because of {}'.format(e))

    return rd
Esempio n. 19
0
def LoadData(logger, argParam):
    '''download data

    This function reads ../config/modules/loadData.json, downloads the
    table named there from the database and saves it as raw_data.npy in
    the configured folder.

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information
    argParam
        Not used by this function

    Returns
    -------
    numpy.ndarray or None
        The downloaded rows as an array, or None when anything failed
        (the error is logged).
    '''

    print('We are in LoadData module.')

    try:
        print('hi')
        # the with-block makes sure the config file handle is closed
        with open('../config/modules/loadData.json') as configFile:
            jsonConfig = jsonref.load(configFile)
        print('hEre I am.')
        schema = jsonConfig['saveData']['schema']
        table = jsonConfig['saveData']['table']
        saveFolder = jsonConfig['saveData']['saveFolder']

        query = sql.SQL('''
                        SELECT *
                        FROM {schema}.{table}
                        ''').format(schema=sql.Identifier(schema),
                                    table=sql.Identifier(table))

        data = pgIO.getAllData(query)

        # Check that the data is properly loaded
        print("-" * 10)

        data = np.array(data)
        # Save the data to the configured folder
        np.save(os.path.join(saveFolder, 'raw_data.npy'), data)

        return data

    except Exception as e:
        logger.error(f'Unable to run LoadData \n {e}')

    return None
Esempio n. 20
0
def countRaceAge(logger):
    '''Tallies the users of each main race per age bin

    For every race in table1_config["inputs"]["races"] one count query
    over sarah.test2 joined with sarah.test3 is run per age bin
    (1-11, 12-17, 18-34, 35-49, 50-100).

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information

    Returns
    -------
    list
        One list of five counts per configured race, in config order.
        On failure the error is logged and only the fully processed
        races are returned.
    '''

    try:
        total = []
        for race in table1_config["inputs"]["races"]:
            perBin = []
            lowerBounds = ['1', '12', '18', '35', '50']
            upperBounds = ['11', '17', '34', '49', '100']
            for lower, upper in zip(lowerBounds, upperBounds):
                template = SQL('''
                SELECT
                    count(*)
                FROM 
                    sarah.test2 t1
                INNER JOIN 
                    sarah.test3 t2
                ON
                    t1.patientid = t2.patientid
                WHERE 
                    t1.age >= {} AND t1.age <= {} and t1.race = {}
                ''')
                query = template.format(Literal(lower), Literal(upper), Literal(race))
                # the count sits in the first column of the first row
                firstColumn = [record[0] for record in pgIO.getAllData(query)]
                perBin.append(firstColumn[0])
            total.append(perBin)

    except Exception as e:
        logger.error('countRaceAge failed because of {}'.format(e))

    return total
Esempio n. 21
0
def genAllKeys(logger):
    '''Writes the (siteid, backgroundid) keys of all users of the configured races

    For every race in table1_config["params"]["races"]["all"], the distinct
    (siteid, backgroundid) pairs obtained by joining raw_data.background
    with raw_data.pdiagnose are appended to
    ../data/raw_data/allUserKeys.csv, which is recreated on every call.
    The .csv file will then be used for the second filter by dsmno.

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information (not used here)
    '''
    all_userkeys = "../data/raw_data/allUserKeys.csv"

    # newline='' lets the csv module control line endings itself; the
    # with-block closes the file, so no explicit close() is needed
    with open(all_userkeys, 'w', newline='') as f:
        filewriter = csv.writer(f, delimiter=',')
        for race in table1_config["params"]["races"]["all"]:
            print("currently getting data for the " + race + " race")
            query = SQL('''
            select t1.siteid, t2.backgroundid
            from (
                select id, siteid
                from raw_data.background
                where race = {}
            ) as t1
            inner join raw_data.pdiagnose t2
            on t1.siteid = t2.siteid
            and t1.id = t2.backgroundid
            group by (t1.siteid, t2.backgroundid)
            ''').format(Literal(race))
            data = pgIO.getAllData(query)
            print("data is " + str(len(data)) + " items long")
            filewriter.writerows([d[0], d[1]] for d in data)
    return
Esempio n. 22
0
def countRaceSetting(logger):
    '''Tallies the users of each main race per treatment setting

    For every race in table1_config["inputs"]["races"] one count query
    over sarah.test2 joined with sarah.test3 is run per setting in
    table1_config["inputs"]["settings"].

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information

    Returns
    -------
    list
        One list of per-setting counts per configured race, in config
        order. On failure the error is logged and only the fully
        processed races are returned.
    '''

    try:
        total = []
        for race in table1_config["inputs"]["races"]:
            perSetting = []
            for setting in table1_config["inputs"]["settings"]:
                template = SQL('''
                SELECT
                    count(*)
                FROM 
                    sarah.test2 t1
                INNER JOIN 
                    sarah.test3 t2
                ON
                    t1.patientid = t2.patientid
                WHERE 
                    t1.visit_type = {} AND t1.race = {}
                ''')
                query = template.format(Literal(setting), Literal(race))
                # the count sits in the first column of the first row
                firstColumn = [record[0] for record in pgIO.getAllData(query)]
                perSetting.append(firstColumn[0])
            total.append(perSetting)

    except Exception as e:
        logger.error('countRaceSetting failed because of {}'.format(e))

    return total
Esempio n. 23
0
def addmorethan2sudcolumn(logger):
    '''Populates the 'morethan2sud' column in sarah.test4

    This function sums the SUD category flags of each user in sarah.test4
    (joined with sarah.test2). Users with at least 2 'True' values are
    written to ../data/raw_data/morethan2suduser_keys.csv and get their
    'morethan2sud' column set to True; every row still null afterwards is
    set to False.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information
    '''
    try:
        query = '''
        SELECT
            t1.patientid,
            t2.alc,
            t2.cannabis,
            t2.amphe,
            t2.halluc,
            t2.nicotin,
            t2.cocaine,
            t2.opioids,
            t2.sedate,
            t2.others,
            t2.polysub,
            t2.inhalant
        FROM
            sarah.test2 t1
        INNER JOIN
            sarah.test4 t2
        ON
            t1.patientid = t2.patientid
        '''

        data = pgIO.getAllData(query)

        csvfile = '../data/raw_data/morethan2suduser_keys.csv'

        # newline='' lets the csv module control line endings itself; the
        # with-block closes (and flushes) the file before it is read back
        with open(csvfile, 'w', newline='') as output:
            csv_output = csv.writer(output)

            for row in data:
                # columns 1..11 are the SUD category flags
                if sum(row[1:12]) >= 2:
                    csv_output.writerow(row)

        with open(csvfile, newline='') as f:
            readCSV = csv.reader(f, delimiter=",")

            for user in tqdm(readCSV):
                # NOTE(review): patientid is interpolated as plain text;
                # it originates from the query above, not from user input
                updateQuery = '''
                UPDATE
                    sarah.test4
                SET
                    morethan2sud = True
                WHERE
                   patientid = {}
                '''.format(user[0])
                print(pgIO.commitData(updateQuery))

        #Update column's null values to false
        updateQuery2 = '''
        UPDATE
            sarah.test4
        SET
            morethan2sud = False
        WHERE
           morethan2sud is null
        '''
        print(pgIO.commitData(updateQuery2))

    except Exception as e:
        logger.error(
            'adding morethan2sud column to the databse failed because of {}'.
            format(e))

    return
Esempio n. 24
0
def getData(logger):
    '''get data from mindlinc

    This function pulls CGI severity scores (the target) and medications
    (the features) for a fixed cohort window, and combines them into one
    dataframe with one row per patient.

    Both queries read {dbVersion}.typepatient restricted to rows with a
    non-null age, patientid within cohortWindow and days within
    daysWindow, joined with {dbVersion}.cgi and {dbVersion}.meds
    respectively on typepatientid. The intermediate and combined
    dataframes are pickled under ../data/raw_data/, but only when the
    respective file does not exist yet.

    Parameters
    ----------
    logger : {logging.Logger}
        The logger used for logging error information

    Returns
    -------
    pandas.DataFrame
        Indexed by patientID; one indicator column per medication plus a
        'cgi' column holding the patient's mean severity. Only patients
        present in both the meds and the cgi data are included.
    '''
    # dbVersion is used as the schema qualifier inside both queries
    dbName = projConfig['inputs']['dbName']
    dbVersion = projConfig['inputs']['dbVersion']
    # inclusive [min, max] bounds applied to patientid and days
    cohortWindow = [0, 1000]
    daysWindow = [0, 365]

    # get CGI data - target
    cgi_query = '''
                SELECT distinct on (severity, {0}.cgi.patientid, days) severity, {0}.cgi.patientid, days
                from 
                (
                    select * from {0}.typepatient
                        where age is not null 
                        and patientid >= {1} and patientid <= {2}
                        and days >= {3} and days <= {4} 
                ) as temp1
                inner join {0}.cgi
                on {0}.cgi.typepatientid = temp1.typepatientid  
                '''.format(dbVersion, cohortWindow[0], cohortWindow[1],
                           daysWindow[0], daysWindow[1])
    cgi_data = pgIO.getAllData(cgi_query, dbName=dbName)
    cgi_df = pd.DataFrame(cgi_data, columns=['cgi', 'patientID', 'days'])
    # an existing pickle is never overwritten
    if not os.path.exists('../data/raw_data/cgi.pkl'):
        cgi_df.to_pickle('../data/raw_data/cgi.pkl')

    # get meds data - Features
    meds_query = '''
                SELECT distinct on (medication, {0}.meds.patientid, days) medication, {0}.meds.patientid, days from 
                (
                    select * from {0}.typepatient
                        where age is not null 
                        and patientid >= {1} and patientid <= {2}
                        and days >= {3} and days <= {4} 
                ) as temp1
                inner join {0}.meds
                on {0}.meds.typepatientid = temp1.typepatientid    
                '''.format(dbVersion, cohortWindow[0], cohortWindow[1],
                           daysWindow[0], daysWindow[1])
    meds_data = pgIO.getAllData(meds_query, dbName=dbName)
    meds_df = pd.DataFrame(meds_data, columns=['meds', 'patientID', 'days'])
    # an existing pickle is never overwritten
    if not os.path.exists('../data/raw_data/meds.pkl'):
        meds_df.to_pickle('../data/raw_data/meds.pkl')

    # one row per patient: mean severity over all of the patient's records
    cgiOut = cgi_df.drop('days', axis=1).groupby(['patientID'],
                                                 sort=False,
                                                 as_index=False)['cgi'].mean()

    # one row per patient: list of the distinct medications recorded
    medsOut = meds_df.drop(
        'days', axis=1).groupby('patientID', sort=False,
                                as_index=False).agg(lambda x: list(x.unique()))
    # join each list with '|' so get_dummies can expand it into one
    # indicator column per medication, then re-attach the patientID
    medsOut = medsOut['meds'].str.join('|').str.get_dummies().join(
        medsOut[['patientID']])

    # inner merge keeps only patients that have both meds and cgi data
    dataOut = pd.merge(medsOut, cgiOut, how='inner', on='patientID')
    dataOut.set_index('patientID', inplace=True)
    # an existing pickle is never overwritten
    if not os.path.exists('../data/raw_data/combined.pkl'):
        dataOut.to_pickle('../data/raw_data/combined.pkl')
    # print(dataOut.describe())

    return dataOut
Esempio n. 25
0
def createDF_allRaces_morethan2SUD(logger):
    '''Creates dataframe for total sample, dependent variable = more than 2 sud

    This function creates a dataframe for the total sample, where the
    dependent variable is >=2 sud and the independent variables are:
    race, age, sex and setting.

    The result has one row per record returned by the query and these
    columns, in order: ``sud`` (recoded from False/True to 0/1), the race
    dummies ``MR`` and ``NHPI``, the age-group dummies ``12-17``, ``18-34``
    and ``35-49``, the sex dummy ``M``, every setting dummy up to and
    including ``Hospital``, and a constant ``intercept`` of 1.0. Dummy levels
    that are not listed are left out of the frame.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        pandas.DataFrame -- the frame described above, or None when anything
        failed (the error is logged, not raised)
    '''

    # Bound up front so the final `return` cannot raise UnboundLocalError
    # (and hide the logged error) when the body below fails.
    df = None

    try:

        query = '''
        SELECT 
            t2.morethan2sud,
            t1.race,
            t1.age,
            t1.sex,
            t1.visit_type
            
        FROM 
            sarah.test2 t1
        INNER JOIN 
            sarah.test4 t2
        ON
            t1.patientid = t2.patientid
        WHERE
            t1.age BETWEEN 12 AND 100
        '''

        data = pgIO.getAllData(query)
        main = pd.DataFrame(
            data, columns=['sud', 'race', 'age', 'sex', 'setting'])

        # Bin the ages. Only the age column is touched, and the last bin
        # includes 100 because the query's BETWEEN is inclusive.
        ageGroups = main['age']
        for ages, label in ((range(12, 18), '12-17'),
                            (range(18, 35), '18-34'),
                            (range(35, 50), '35-49'),
                            (range(50, 101), '50+')):
            ageGroups = ageGroups.replace(to_replace=list(ages), value=label)

        # .astype('uint8') keeps the 0/1 integer dummies that older pandas
        # produced; newer pandas returns booleans from get_dummies.
        dummy_races = pd.get_dummies(main['race']).astype('uint8')
        dummy_ages = pd.get_dummies(ageGroups).astype('uint8')
        dummy_sexes = pd.get_dummies(main['sex']).astype('uint8')
        dummy_setting = pd.get_dummies(main['setting']).astype('uint8')

        # Change sud column to binary, then attach the dummy columns.
        # `.loc` replaces the `.ix` indexer that pandas 1.0 removed.
        design = main[['sud']].replace({False: 0, True: 1})
        design = design.join(dummy_races[['MR', 'NHPI']])
        design = design.join(dummy_ages[['12-17', '18-34', '35-49']])
        design = design.join(dummy_sexes[['M']])
        design = design.join(dummy_setting.loc[:, :'Hospital'])
        design['intercept'] = 1.0

        df = design

    except Exception as e:
        logger.error(
            'createDF_allRaces_morethan2SUD failed because of {}'.format(e))

    return df
Esempio n. 26
0
def popTest4(logger):
    '''Populates test4

    This function populates the table sarah.test4 with one row per patient id
    read from ../data/raw_data/SUDUser_keys.csv. Each row holds the patient id
    plus one boolean column per SUD category; a column is true when the
    patient's dsmno codes in rwe_version1_1.pdiagnose overlap the codes
    configured for that category in table2_config["params"]["sudcats"].

    After all rows are inserted, rows that share a patientid are reduced to
    the one with the highest ctid. This runs once at the end rather than
    before every insert, so the last inserted patient is de-duplicated too
    and the table is not rescanned per patient.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information
    '''
    try:

        all_userkeys = "../data/raw_data/SUDUser_keys.csv"

        # Same order as the placeholders in getQuery and the INSERT columns.
        categories = ("alc", "cannabis", "amphe", "halluc", "nicotin",
                      "cocaine", "opioids", "sedate", "others", "polysub",
                      "inhalant")
        sudcats = table2_config["params"]["sudcats"]
        categoryLiterals = [Literal(sudcats[name]) for name in categories]

        # The query templates do not change per patient, so build them once.
        getQuery = SQL('''
        SELECT
            patientid,
            array_agg(distinct cast(dsmno as text)) && array[{}] as alc,
            array_agg(distinct cast(dsmno as text)) && array[{}] as cannabis,
            array_agg(distinct cast(dsmno as text)) && array[{}] as amphe,
            array_agg(distinct cast(dsmno as text)) && array[{}] as halluc,
            array_agg(distinct cast(dsmno as text)) && array[{}] as nicotin,
            array_agg(distinct cast(dsmno as text)) && array[{}] as cocaine,
            array_agg(distinct cast(dsmno as text)) && array[{}] as opioids,
            array_agg(distinct cast(dsmno as text)) && array[{}] as sedate,
            array_agg(distinct cast(dsmno as text)) && array[{}] as others,
            array_agg(distinct cast(dsmno as text)) && array[{}] as polysub,
            array_agg(distinct cast(dsmno as text)) && array[{}] as inhalant
        FROM
            rwe_version1_1.pdiagnose
        WHERE
            patientid = {}
        GROUP BY
            patientid
        ''')

        pushQuery = '''
        INSERT INTO 
            sarah.test4(patientid, alc, cannabis, amphe, halluc, nicotin, cocaine, opioids, sedate, others, polysub, inhalant)
        VALUES
            %s
        '''

        deleteDupliQuery = '''
        DELETE FROM sarah.test4 a USING (
            SELECT MAX(ctid) as ctid, patientid
            FROM sarah.test4
            GROUP BY patientid HAVING count(*) > 1
            ) b
        WHERE a.patientid = b.patientid
        AND a.ctid <> b.ctid
        '''

        with open(all_userkeys) as f:
            readCSV = csv.reader(f, delimiter=",")

            for user in tqdm(readCSV):
                # csv.reader yields [] for a blank line; skip it instead of
                # letting user[0] abort the whole run.
                if not user:
                    continue

                query = getQuery.format(*categoryLiterals,
                                        Literal(int(user[0])))
                data = pgIO.getAllData(query)
                print(pgIO.commitDataList(pushQuery, data))

        # De-duplicate once, after every insert has been issued.
        if pgIO.commitData(deleteDupliQuery):
            print("Duplicate values succesfully deleted")

    except Exception as e:
        logger.error('Failed to populate test4 table because of {}'.format(e))
    return
Esempio n. 27
0
def ageBinnedGeneralSUD(logger):
    '''

    Finds percentage of the age-binned sample that has any SUD and at least 2 SUD

    For every race in table2_config["inputs"]["races"] and every age bin
    (1-11, 12-17, 18-34, 35-49, 50-100) two counts are collected: users
    flagged sud = true in sarah.test3, and users with two or more true SUD
    category columns in sarah.test4. Each list of per-race counts is then
    passed through divByAgeBins.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        dict -- {"any_sud": ..., "morethan2_sud": ...} holding whatever
        divByAgeBins returns for each list of counts; an empty dict when
        anything failed (the error is logged, not raised)
    '''
    # Bound up front so the final `return` cannot raise UnboundLocalError
    # (and hide the logged error) when the body below fails.
    resultsDict = {}

    try:

        ageBins = (('1', '11'), ('12', '17'), ('18', '34'), ('35', '49'),
                   ('50', '100'))
        races = table2_config["inputs"]["races"]

        countDict = {
            "any_sud": [],
            "morethan2_sud": []
        }

        # Find number of users in each race who have any SUD, separated into age bins
        anySudQuery = SQL('''
        SELECT 
            count(*)
        FROM 
            sarah.test2 t1
        INNER JOIN 
            sarah.test3 t2
        ON
            t1.patientid = t2.patientid
        WHERE 
            t1.race = {} 
        AND 
            t1.age BETWEEN {} AND {}
        AND
            t2.sud = true
        ''')
        for race in races:
            counts = []
            for lower, upper in ageBins:
                query = anySudQuery.format(
                    Literal(race),
                    Literal(lower),
                    Literal(upper)
                )
                rows = pgIO.getAllData(query)
                counts.append(rows[0][0])
            countDict["any_sud"].append(counts)

        # Find number of users in each race who have >=2 SUD, separated into age bins.
        # The output rows are always AA, NHPI, MR, in that order.
        # NOTE(review): "any_sud" rows follow the config order instead; the two
        # only line up if the config lists the races as AA, NHPI, MR - confirm.
        count = {
            race: {lower: 0 for lower, _ in ageBins}
            for race in ("AA", "NHPI", "MR")
        }

        categoryQuery = SQL('''
        SELECT 
            t2.alc,
            t2.cannabis,
            t2.amphe,
            t2.halluc,
            t2.nicotin,
            t2.cocaine,
            t2.opioids,
            t2.sedate,
            t2.others,
            t2.polysub,
            t2.inhalant
        FROM
            sarah.test2 t1
        INNER JOIN 
            sarah.test4 t2
        ON
            t1.patientid = t2.patientid
        WHERE 
            t1.race = {}
        AND
            t1.age BETWEEN {} AND {}
        ''')
        for race in races:
            for lower, upper in ageBins:
                query = categoryQuery.format(
                    Literal(race),
                    Literal(lower),
                    Literal(upper)
                )
                rows = pgIO.getAllData(query)
                # Each row is one user's category flags; summing them gives
                # the number of SUD categories that user has.
                count[race][lower] += sum(1 for row in rows if sum(row) >= 2)

        for race in count:
            countDict["morethan2_sud"].append(
                [count[race][lower] for lower, _ in ageBins])

        # Change counts to percentage of the race sample
        resultsDict = {
            row: divByAgeBins(values) for row, values in countDict.items()
        }

    except Exception as e:
        logger.error('Failed to find general SUD counts because of {}'.format(e))

    return resultsDict
Esempio n. 28
0
def allAgesGeneralSUD(logger):
    '''

    Finds percentage of the total sample that has any SUD and at least 2 SUD

    For every race in table2_config["inputs"]["races"] two counts are
    collected: users of that race with a row in sarah.test4, and users with
    two or more true SUD category columns in sarah.test4. Each list of counts
    is then passed through divByAllAges.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        dict -- {"any_sud": ..., "morethan2_sud": ...} holding whatever
        divByAllAges returns for each list of counts; an empty dict when
        anything failed (the error is logged, not raised)
    '''
    # Bound up front so the final `return` cannot raise UnboundLocalError
    # (and hide the logged error) when the body below fails.
    resultsDict = {}

    try:

        races = table2_config["inputs"]["races"]

        countDict = {
            "any_sud": [],
            "morethan2_sud": []
        }

        # Find number of users in each race who have any SUD
        anySudQuery = SQL('''
        SELECT 
            count(*)
        FROM 
            sarah.test2 t1
        INNER JOIN
            sarah.test4 t2
        ON
            t1.patientid = t2.patientid 
        WHERE 
            t1.race = {}
        ''')
        for race in races:
            rows = pgIO.getAllData(anySudQuery.format(Literal(race)))
            countDict["any_sud"].append(rows[0][0])

        # Find number of users in each race who have >=2 SUD.
        # The output entries are always AA, NHPI, MR, in that order.
        # NOTE(review): "any_sud" entries follow the config order instead; the
        # two only line up if the config lists the races as AA, NHPI, MR - confirm.
        count = {
            "AA": 0,
            "NHPI": 0,
            "MR": 0
        }

        categoryQuery = SQL('''
        SELECT 
            t2.alc,
            t2.cannabis,
            t2.amphe,
            t2.halluc,
            t2.nicotin,
            t2.cocaine,
            t2.opioids,
            t2.sedate,
            t2.others,
            t2.polysub,
            t2.inhalant
        FROM
            sarah.test2 t1
        INNER JOIN 
            sarah.test4 t2
        ON
            t1.patientid = t2.patientid 
        WHERE 
            t1.race = {}
        ''')
        for race in races:
            rows = pgIO.getAllData(categoryQuery.format(Literal(race)))
            # Each row is one user's category flags; summing them gives the
            # number of SUD categories that user has.
            count[race] += sum(1 for row in rows if sum(row) >= 2)

        for race in count:
            countDict["morethan2_sud"].append(count[race])

        # Change counts to percentage of the race sample
        resultsDict = {
            row: divByAllAges(values) for row, values in countDict.items()
        }

    except Exception as e:
        logger.error('Failed to find general SUD counts because of {}'.format(e))

    return resultsDict
Esempio n. 29
0
def genDiagCount(logger, filePath):
    '''

    Computes, for each diagnosis category and race, the share of users with that diagnosis

    The JSON file at filePath is loaded and indexed as results[race][0] to get
    the denominator for each race. For every category and every race in
    fig1_config["inputs"]["races"], the rows of sarah.test3 joined to
    sarah.test2 with that category flag true are counted, divided by the
    denominator, multiplied by 100 and rounded to one decimal place.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information
        filePath {str} -- path of the JSON file holding the per-race totals

    Returns:
        dict -- category name -> list of percentages, one per race; on failure
        the lists hold whatever was computed before the error was logged
    '''
    try:
        categoryNames = (
            "mood", "anxiety", "adjustment", "adhd", "sud",
            "psyc", "pers", "childhood", "impulse", "cognitive",
            "eating", "smtf", "disso", "sleep", "fd",
        )
        # A fresh list per key; the lists are appended to independently.
        resultsDict = {name: [] for name in categoryNames}

        with open(filePath) as json_file:  
            table1results = json.load(json_file)

        for category, percentages in resultsDict.items():
            for race in fig1_config["inputs"]["races"]:
                query = SQL('''
                SELECT 
                    count(*)
                FROM 
                    sarah.test3 t1
                INNER JOIN 
                    sarah.test2 t2
                ON 
                    t1.patientid = t2.patientid
                WHERE 
                    t1.{} is true
                AND 
                    t2.race = {}
                ''').format(
                    Identifier(category),
                    Literal(race)
                )
                firstColumn = [record[0] for record in pgIO.getAllData(query)]
                share = firstColumn[0] / table1results[race][0]
                percentages.append(round(share * 100, 1))

    except Exception as e:
        logger.error('Failed to generate count {}'.format(e))

    return resultsDict
Esempio n. 30
0
def ageBinnedGeneralSUD(logger):
    '''

    Counts, per race and age bin, the users with any SUD and with two or more SUDs

    For every race in table2_config["inputs"]["races"] and every age bin
    (1-11, 12-17, 18-34, 35-49, 50-100) the rows of tejas.sud_race_age are
    counted twice: once in total, and once restricted to rows where at least
    two of the SUD category columns are true.

    Decorators:
        lD.log

    Arguments:
        logger {logging.Logger} -- logs error information

    Returns:
        dict -- {"any_sud": {race: [five counts]}, "morethan2_sud": {race:
        [five counts]}}; on failure, whatever had been stored before the
        error was logged
    '''
    try:

        countDict = {
            "any_sud": {"AA": [], "NHPI": [], "MR": []},
            "morethan2_sud": {},
        }
        lowerBounds = ['1', '12', '18', '35', '50']
        upperBounds = ['11', '17', '34', '49', '100']

        # Total rows per race and age bin
        for race in table2_config["inputs"]["races"]:
            perBin = [0] * 5
            for slot, (lower, upper) in enumerate(zip(lowerBounds,
                                                      upperBounds)):
                query = SQL('''
                WITH subQ AS (
                SELECT * FROM tejas.sud_race_age
                WHERE
                sud_race_age.race = {}
                AND
                sud_race_age.age BETWEEN {} AND {}
                ) SELECT count(*) FROM subQ
                ''').format(Literal(race), Literal(lower), Literal(upper))
                firstColumn = [record[0] for record in pgIO.getAllData(query)]
                perBin[slot] = firstColumn[0]
            countDict["any_sud"][race] = perBin

        # Rows per race and age bin with at least two true category columns
        multiSud = {"AA": [], "NHPI": [], "MR": []}

        for race in table2_config["inputs"]["races"]:
            perBin = [0] * 5
            for slot, (lower, upper) in enumerate(zip(lowerBounds,
                                                      upperBounds)):
                query = SQL('''
                SELECT alc, cannabis, amphe, halluc, nicotin, cocaine,
                    opioids, sedate, others, polysub, inhalant
                FROM tejas.sud_race_age
                WHERE sud_race_age.age >= {}
                AND sud_race_age.age <= {}
                AND sud_race_age.race = {}
                ''').format(Literal(lower), Literal(upper), Literal(race))
                records = pgIO.getAllData(query)
                # Summing a row's flags gives its number of SUD categories.
                perBin[slot] += sum(
                    1 for record in records if sum(record) >= 2)
            multiSud[race] = perBin
        countDict["morethan2_sud"] = multiSud
    except Exception as e:
        logger.error(
            'Failed to find general SUD counts because of {}'.format(e))
    return countDict