Esempio n. 1
0
    def get_concatenation(self):
        """Write every listed feature into the per-visit vectors of each patient.

        For each feature name in ``self.feature_list`` the list of
        ``(patient id, time, feature value)`` triples is fetched from the
        category that declares the feature in ``self.feature_name`` (checked in
        the order motor, non-motor, biospecimen, image, medication).  Values
        that ``isint`` / ``isfloat`` accept are converted and, for patients
        contained in ``self.patient_id``, stored at
        ``patient.patient_rec[time][self.feature_dict[fname]]``.  A vector
        created for a previously unseen time stamp is a ``float32`` array of
        length ``self.feature_len`` filled with ``-1``.

        Returns:
            tuple: ``(self.patient_info, self.feature_len)``.

        Raises:
            ValueError: if a feature name is declared in none of the five
                categories.  Previously this either failed with an
                ``UnboundLocalError`` or silently reused the triples of the
                feature processed just before.
        """
        # (key in self.feature_name, attribute name on self.dataio.feature)
        categories = (
            ('motor', 'motor'),
            ('non-motor', 'nonmotor'),
            ('biospecimen', 'biospecimen'),
            ('image', 'image'),
            ('medication', 'medication'),
        )

        for fname in self.feature_list:
            # get triple lists (patient id, time, feature value)
            for category, attr in categories:
                if fname in self.feature_name[category]:
                    source = getattr(self.dataio.feature, attr)
                    tpl_list = source.feature_info[fname]
                    break
            else:
                raise ValueError(
                    "feature %r is not listed in any category of "
                    "self.feature_name" % (fname,))

            # patient id : a list of (time stamp, feature val)
            pat_record = dict()
            for tpl in tpl_list:
                raw = tpl[2]
                if isint(raw):
                    fval = int(raw)
                elif isfloat(raw):
                    fval = float(raw)
                else:
                    # non-numeric entries are ignored
                    continue
                pat_id = tpl[0]
                stamp = convert_int(tpl[1])
                if pat_id not in self.patient_id:
                    continue
                pat_record.setdefault(pat_id, []).append((stamp, fval))

            # store the records into Patient
            fidx = self.feature_dict[fname]  # index of feature dimension
            for pat_id, tf_list in pat_record.items():
                patient = self.patient_info[pat_id]
                for stamp, fval in tf_list:
                    if stamp not in patient.patient_rec:
                        # -1 marks slots that no feature has filled yet
                        patient.patient_rec[stamp] = numpy.zeros(
                            self.feature_len, dtype='float32') - 1
                    patient.patient_rec[stamp][fidx] = fval
                self.patient_info[pat_id] = patient
        return (self.patient_info, self.feature_len)
Esempio n. 2
0
    def get_hy_stage(self):
        """Return, per patient, the latest numeric value of the H&Y features.

        The features considered are those returned by
        ``self.get_feature_set('MDS UPDRS PartIII', 'H&Y')``.  For each of them
        the ``(patient id, time, feature value)`` triples in
        ``self.feature_info`` are grouped by patient, sorted by time stamp, and
        the value of the last entry is kept.  Because the sort is stable, the
        entry that appears last in the input wins among equal time stamps.  If
        several selected features hold data for the same patient, the feature
        iterated last overwrites the earlier ones.

        Returns:
            dict: patient id -> feature value (``int`` or ``float``).
        """
        hy_stage = dict()
        # The selection does not depend on the loop variable, so compute it
        # once instead of once per feature.
        selected = self.get_feature_set('MDS UPDRS PartIII', 'H&Y')

        for fn, tpl_list in self.feature_info.items():
            if fn not in selected:
                continue

            # patient id : a list of (time stamp, feature val)
            pat_record = dict()
            for tpl in tpl_list:
                raw = tpl[2]
                if isint(raw):
                    fv = int(raw)
                elif isfloat(raw):
                    fv = float(raw)
                else:
                    # non-numeric entries are ignored
                    continue
                pat = tpl[0]
                stamp = convert_int(tpl[1])
                pat_record.setdefault(pat, []).append((stamp, fv))

            # keep the value recorded at the latest time stamp
            for pat, tf_list in pat_record.items():
                ordered = sorted(tf_list, key=operator.itemgetter(0))
                hy_stage[pat] = ordered[-1][1]
                # NOTE(review): prints the whole mapping once per patient;
                # looks like leftover debugging output - confirm before removing.
                print(hy_stage)
        return hy_stage
Esempio n. 3
0
    def get_biospecimen(self, dataio, K, fname=None, featname=None):
        """Summarise a biospecimen feature per cluster and record its p-values.

        Numeric ``(patient id, time, feature value)`` triples are read from
        ``dataio.feature.biospecimen.feature_info`` for every feature name in
        ``self.get_feature_set(fname, featname)``; patients missing from
        ``dataio.patient_cluster`` are ignored.  Each patient's records are
        sorted by time stamp and handed to ``self.get_feature_value`` and
        ``self.get_feature_diff``, which maintain the first / median / last /
        difference dictionaries.

        Side effects:
            * resets ``self.BIO``, ``self.BIO_first`` and ``self.BIO_median``
              for the cluster keys ``'1'`` .. ``str(K)`` and refills them;
            * stores per-cluster ``(mean, std)`` lists in ``self.mean``,
              ``self.mean_first`` and ``self.mean_median`` and the overall
              follow-up ``(mean, std)`` in ``self.mean_total``, keyed by
              ``fname`` or ``fname + '-' + featname``;
            * appends ``[name, p_first, p_median, p_last, p_diff]`` to
              ``self.p_value``;
            * prints a textual report.

        Args:
            dataio: object exposing ``patient_cluster`` (patient id -> cluster
                label) and ``feature.biospecimen.feature_info``.
            K: number of clusters; cluster keys are ``str(1)`` .. ``str(K)``.
            fname: feature group name; ``'CSF'`` selects the CSF-specific
                tests, which only use the last value.
            featname: optional feature name inside the group.
        """
        pat_cluster = dataio.patient_cluster
        feat_info = dataio.feature.biospecimen.feature_info

        # initialization for each cluster
        cluster_keys = [str(i) for i in range(1, K + 1)]
        for key in cluster_keys:
            self.BIO[key] = list()
            self.BIO_first[key] = list()
            self.BIO_median[key] = list()

        # initialization for patient and the corresponding feature value
        pat_fval_first = dict()  # patient id : first feature value
        pat_fval_median = dict()  # patient id : median feature value
        pat_fval_last = dict()  # patient id : last feature value
        pat_fval_diff = dict()  # patient id : first-order difference
        # between last and first feature value

        # The selection is the same for every feature, so look it up once.
        selected = self.get_feature_set(fname, featname)
        for fn, tpl_list in feat_info.items():
            if fn not in selected:
                continue

            # patient id : a list of (time stamp, feature val)
            pat_record = dict()
            for tpl in tpl_list:
                raw = tpl[2]
                if isint(raw):
                    fv = int(raw)
                elif isfloat(raw):
                    fv = float(raw)
                else:
                    # non-numeric entries are ignored
                    continue
                pat = tpl[0]
                stamp = convert_int(tpl[1])
                if pat not in pat_cluster:
                    continue
                pat_record.setdefault(pat, []).append((stamp, fv))

            # sort each patient by time stamp, then store last,
            # (first, median) values and the difference
            for pat, tf_list in pat_record.items():
                ordered = sorted(tf_list, key=operator.itemgetter(0))
                pat_fval_first, pat_fval_median, pat_fval_last = \
                    self.get_feature_value(pat, ordered, pat_fval_first,
                                           pat_fval_median, pat_fval_last)
                pat_fval_diff = self.get_feature_diff(pat, ordered,
                                                      pat_fval_diff)

        # store feature values according to subtypes
        for pat, cls in pat_cluster.items():
            if pat in pat_fval_last:
                self.BIO[str(cls)].append(pat_fval_last[pat])
            if pat in pat_fval_first:
                self.BIO_first[str(cls)].append(pat_fval_first[pat])
            if pat in pat_fval_median:
                self.BIO_median[str(cls)].append(pat_fval_median[pat])

        # compute statistics
        # mean , std
        stats = Statistics(K)
        mean_BIO, std_BIO = stats.get_mean_std(self.BIO, is_total=False)
        mean_total_BIO, std_total_BIO = stats.get_mean_std(self.BIO,
                                                           is_total=True)
        mean_BIO_first, std_BIO_first = stats.get_mean_std(self.BIO_first,
                                                           is_total=False)
        mean_total_BIO_first, std_total_BIO_first = stats.get_mean_std(
            self.BIO_first, is_total=True)
        mean_BIO_median, std_BIO_median = stats.get_mean_std(self.BIO_median,
                                                             is_total=False)
        mean_total_BIO_median, std_total_BIO_median = stats.get_mean_std(
            self.BIO_median, is_total=True)

        # key under which results are stored: "<fname>" or "<fname>-<featname>"
        if featname is not None:
            label = fname + '-' + featname
        else:
            label = fname
        self.mean[label] = [(mean_BIO[key], std_BIO[key])
                            for key in cluster_keys]
        self.mean_first[label] = [(mean_BIO_first[key], std_BIO_first[key])
                                  for key in cluster_keys]
        self.mean_median[label] = [(mean_BIO_median[key], std_BIO_median[key])
                                   for key in cluster_keys]

        # display
        for i in range(1, K + 1):
            key = str(i)
            print('### CLUSTER %d ####' % i)
            print(
                "The average %s value in the %d-th clusters at follow-up is: %f (%f)"
                % (fname, i, mean_BIO[key], std_BIO[key]))
            print(
                "The average value in the %d-th clusters at baseline is: %f (%f)"
                % (i, mean_BIO_first[key], std_BIO_first[key]))
            print(
                "The average value in the %d-th clusters at median is: %f (%f)"
                % (i, mean_BIO_median[key], std_BIO_median[key]))
        print("The total average %s at follow-up is: %f (%f)" %
              (fname, mean_total_BIO, std_total_BIO))
        print("The total average %s at baseline is: %f (%f)" %
              (fname, mean_total_BIO_first, std_total_BIO_first))
        print("The total average %s at median is: %f (%f)" %
              (fname, mean_total_BIO_median, std_total_BIO_median))
        print("##########")

        # hypothesis testing
        print("hypothesis testing...")
        # Stays None when no test applies, so that the code below never reads
        # an unassigned name.
        p_last = None
        if fname == 'CSF':
            stats.get_distribution(pat_fval_last,
                                   is_num=True,
                                   is_discretization=True)
            if featname in ('Total tau', 'Abeta 42'):
                p_last = stats.get_f_oneway(pat_fval_last, pat_cluster, label,
                                            'STATIC')
            elif featname in ('p-Tau181P', 'CSF Alpha-synuclein'):
                p_last = stats.get_kruskal(pat_fval_last, pat_cluster, label,
                                           'STATIC')
            self.p_value.append([label, None, None, p_last, None])
        else:
            stats.get_distribution(pat_fval_first, is_num=True)
            p_first = stats.get_chisquare(pat_fval_first, pat_cluster, label,
                                          'FIRST')
            stats.get_distribution(pat_fval_median, is_num=True)
            p_median = stats.get_chisquare(pat_fval_median, pat_cluster, label,
                                           'MEDIAN')
            stats.get_distribution(pat_fval_last, is_num=True)
            p_last = stats.get_chisquare(pat_fval_last, pat_cluster, label,
                                         'LAST')
            stats.get_distribution(pat_fval_diff, is_num=True)
            p_diff = stats.get_chisquare(pat_fval_diff, pat_cluster, label,
                                         'DIFFERENCE')
            # store into self.p_value
            self.p_value.append([label, p_first, p_median, p_last, p_diff])

        # post hoc test: only run on the last values, and only when a p-value
        # was actually computed
        if p_last is not None and p_last <= 0.05:
            stats.get_tukeyhsd(pat_fval_last, pat_cluster, label, 'LAST')

        self.mean_total[label] = (mean_total_BIO, std_total_BIO)
        print('-----------------------')
Esempio n. 4
0
    def get_motor(self, dataio, K, fname=None, featname=None):
        """Summarise a motor feature per cluster and record its p-values.

        Numeric ``(patient id, time, feature value)`` triples are read from
        ``dataio.feature.motor.feature_info`` for every feature name in
        ``self.get_feature_set(fname, featname)``; patients missing from
        ``dataio.patient_cluster`` are ignored.  Each patient's records are
        sorted by time stamp and handed to ``self.get_feature_value`` and
        ``self.get_feature_diff``, which maintain the first / median / last /
        difference dictionaries.

        Side effects:
            * resets ``self.MOTOR``, ``self.MOTOR_first`` and
              ``self.MOTOR_median`` for the cluster keys ``'1'`` .. ``str(K)``
              and refills them;
            * stores per-cluster ``(mean, std)`` lists in ``self.mean``,
              ``self.mean_first`` and ``self.mean_median`` and the overall
              follow-up ``(mean, std)`` in ``self.mean_total``, keyed by
              ``fname`` or ``fname + '-' + featname``;
            * appends ``[name, p_first, p_median, p_last, p_diff]`` to
              ``self.p_value``;
            * calls ``stats.get_tukeyhsd`` for every p-value ``<= 0.05``;
            * prints a textual report.

        Args:
            dataio: object exposing ``patient_cluster`` (patient id -> cluster
                label) and ``feature.motor.feature_info``.
            K: number of clusters; cluster keys are ``str(1)`` .. ``str(K)``.
            fname: feature group name; ``'MDS UPDRS PartIV'`` is tested with
                ``stats.get_fisher_exact``, everything else with
                ``stats.get_chisquare``.
            featname: optional feature name inside the group.
        """
        pat_cluster = dataio.patient_cluster
        feat_info = dataio.feature.motor.feature_info

        # initialization for each cluster
        cluster_keys = [str(i) for i in range(1, K + 1)]
        for key in cluster_keys:
            self.MOTOR[key] = list()
            self.MOTOR_first[key] = list()
            self.MOTOR_median[key] = list()

        # initialization for patient and the corresponding feature value
        pat_fval_first = dict()  # patient id : first feature value
        pat_fval_median = dict()  # patient id : median feature value
        pat_fval_last = dict()  # patient id : last feature value
        pat_fval_diff = dict()  # patient id : first-order difference
        # between last and first feature value

        # read and store
        # The selection is the same for every feature, so look it up once.
        selected = self.get_feature_set(fname, featname)
        for fn, tpl_list in feat_info.items():
            if fn not in selected:
                continue

            # patient id : a list of (time stamp, feature val)
            pat_record = dict()
            for tpl in tpl_list:
                raw = tpl[2]
                if isint(raw):
                    fv = int(raw)
                elif isfloat(raw):
                    fv = float(raw)
                else:
                    # non-numeric entries are ignored
                    continue
                pat = tpl[0]
                stamp = convert_int(tpl[1])
                if pat not in pat_cluster:
                    continue
                pat_record.setdefault(pat, []).append((stamp, fv))

            # sort each patient by time stamp, then store last,
            # (first, median) values and the difference
            for pat, tf_list in pat_record.items():
                ordered = sorted(tf_list, key=operator.itemgetter(0))
                pat_fval_first, pat_fval_median, pat_fval_last = \
                    self.get_feature_value(pat, ordered, pat_fval_first,
                                           pat_fval_median, pat_fval_last)
                pat_fval_diff = self.get_feature_diff(pat, ordered,
                                                      pat_fval_diff)

        # store feature values according to subtypes
        for pat, cls in pat_cluster.items():
            if pat in pat_fval_last:
                self.MOTOR[str(cls)].append(pat_fval_last[pat])
            if pat in pat_fval_median:
                self.MOTOR_median[str(cls)].append(pat_fval_median[pat])
            if pat in pat_fval_first:
                self.MOTOR_first[str(cls)].append(pat_fval_first[pat])

        # compute statistics
        # mean , std
        stats = Statistics(K)
        mean_MOTOR, std_MOTOR = stats.get_mean_std(self.MOTOR, is_total=False)
        mean_MOTOR_median, std_MOTOR_median = stats.get_mean_std(
            self.MOTOR_median, is_total=False)
        mean_MOTOR_first, std_MOTOR_first = stats.get_mean_std(
            self.MOTOR_first, is_total=False)
        mean_total_MOTOR, std_total_MOTOR = stats.get_mean_std(self.MOTOR,
                                                               is_total=True)
        mean_total_MOTOR_median, std_total_MOTOR_median = stats.get_mean_std(
            self.MOTOR_median, is_total=True)
        mean_total_MOTOR_first, std_total_MOTOR_first = stats.get_mean_std(
            self.MOTOR_first, is_total=True)

        # key under which results are stored: "<fname>" or "<fname>-<featname>"
        if featname is not None:
            label = fname + '-' + featname
        else:
            label = fname
        self.mean[label] = [(mean_MOTOR[key], std_MOTOR[key])
                            for key in cluster_keys]
        self.mean_first[label] = [(mean_MOTOR_first[key], std_MOTOR_first[key])
                                  for key in cluster_keys]
        self.mean_median[label] = [
            (mean_MOTOR_median[key], std_MOTOR_median[key])
            for key in cluster_keys
        ]

        # display
        if featname is not None:
            print('feature name: %s' % featname)
        else:
            print('feature name: %s' % fname)
        for i in range(1, K + 1):
            key = str(i)
            print('### CLUSTER %d ####' % i)
            print(
                "The average value in the %d-th clusters at follow-up is: %f (%f)"
                % (i, mean_MOTOR[key], std_MOTOR[key]))
            print(
                "The average value in the %d-th clusters at baseline is: %f (%f)"
                % (i, mean_MOTOR_first[key], std_MOTOR_first[key]))
            print(
                "The average value in the %d-th clusters at median is: %f (%f)"
                % (i, mean_MOTOR_median[key], std_MOTOR_median[key]))
        print("The total average %s at follow-up is: %f (%f)" %
              (fname, mean_total_MOTOR, std_total_MOTOR))
        print("The total average %s at median is: %f (%f)" %
              (fname, mean_total_MOTOR_median, std_total_MOTOR_median))
        print("The total average %s at baseline is: %f (%f)" %
              (fname, mean_total_MOTOR_first, std_total_MOTOR_first))
        print("##########")

        # hypothesis testing
        print("hypothesis testing...")
        # Both variants run the same four tests; only the test method differs.
        if fname == 'MDS UPDRS PartIV':
            run_test = stats.get_fisher_exact
        else:
            run_test = stats.get_chisquare

        series = (
            (pat_fval_first, 'FIRST'),
            (pat_fval_median, 'MEDIAN'),
            (pat_fval_last, 'LAST'),
            (pat_fval_diff, 'DIFFERENCE'),
        )
        p_values = []
        for values, tag in series:
            stats.get_distribution(values, is_num=True)
            p_values.append(run_test(values, pat_cluster, label, tag))
        # row layout: [name, p_first, p_median, p_last, p_diff]
        self.p_value.append([label] + p_values)

        # post hoc test
        for (values, tag), p_val in zip(series, p_values):
            if p_val <= 0.05:
                stats.get_tukeyhsd(values, pat_cluster, label, tag)

        self.mean_total[label] = (mean_total_MOTOR, std_total_MOTOR)
        print('-----------------------')
Esempio n. 5
0
    def load_subtype(self, patient_id, patient_cluster, timestamp='baseline'):
        """Label patients with a motor subtype derived from a TD/PIGD ratio.

        For every feature in ``self.get_feature_set('Motor Subtype')`` the
        numeric ``(patient id, time, feature value)`` triples of
        ``self.feature_info`` are grouped per patient and sorted by time stamp.
        One value per patient and feature is chosen according to ``timestamp``
        and added to the patient's PIGD sum (features named in ``PIGD_set``)
        or TD sum (all other selected features).  Patients with both sums
        are classified from ``(TD sum / 11) / (PIGD sum / 5 + epsilon)``:

        * ratio ``>= 1.15``  -> ``'1'`` (TD)
        * ratio ``<= 0.90``  -> ``'2'`` (PIGD)
        * otherwise          -> ``'3'`` (indeterminate)

        The divisors are fixed; 5 equals the number of names in ``PIGD_set``,
        11 is presumably the number of TD items - TODO confirm.

        Args:
            patient_id: collection of patient ids to consider; it is used both
                for membership tests and for iteration.
            patient_cluster: dict updated in place with ``patient id -> label``.
            timestamp: ``'baseline'`` (earliest record), ``'follow-up'``
                (latest record) or ``'median'`` (record in the middle of the
                time-ordered list; mean of the two middle records for an even
                count).

        Raises:
            ValueError: if ``timestamp`` is not one of the three supported
                values.  Previously such a value led to an
                ``UnboundLocalError`` or to values leaking from another
                patient.
        """
        if timestamp not in ('baseline', 'follow-up', 'median'):
            raise ValueError(
                "timestamp must be 'baseline', 'follow-up' or 'median', "
                "got %r" % (timestamp,))

        epsilon = sys.float_info.epsilon  # keeps the ratio's divisor non-zero
        pat_TD_mean = dict()  # pid: summed feature value related with TD
        pat_PIGD_mean = dict()  # pid: summed feature value related with PIGD
        PIGD_set = {'NP2WALK', 'NP2FREZ', 'NP3GAIT', 'NP3FRZGT', 'NP3PSTBL'}

        # The selection is the same for every feature, so look it up once.
        selected = self.get_feature_set('Motor Subtype')
        for fn, tpl_list in self.feature_info.items():
            if fn not in selected:
                continue

            # patient id : a list of (time stamp, feature val)
            pat_record = dict()
            for tpl in tpl_list:
                raw = tpl[2]
                if isint(raw):
                    fv = int(raw)
                elif isfloat(raw):
                    fv = float(raw)
                else:
                    # non-numeric entries are ignored
                    continue
                pat = tpl[0]
                stamp = convert_int(tpl[1])
                if pat not in patient_id:
                    continue
                pat_record.setdefault(pat, []).append((stamp, fv))

            sums = pat_PIGD_mean if fn in PIGD_set else pat_TD_mean
            for pat, tf_list in pat_record.items():
                # sort according to time stamp, then pick the requested visit
                ordered = sorted(tf_list, key=operator.itemgetter(0))
                if timestamp == 'baseline':
                    pat_fval = ordered[0][1]
                elif timestamp == 'follow-up':
                    pat_fval = ordered[-1][1]
                else:  # 'median'
                    mid = len(ordered) // 2
                    if len(ordered) % 2 == 1:
                        pat_fval = ordered[mid][1]
                    else:
                        pat_fval = (ordered[mid][1] +
                                    ordered[mid - 1][1]) / 2
                    print(len(ordered))

                sums[pat] = sums.get(pat, 0) + pat_fval

        # categorization according to motor subtypes
        for pat in patient_id:
            if pat in pat_PIGD_mean and pat in pat_TD_mean:
                # Use local means instead of dividing the stored sums in place,
                # so an id listed twice in patient_id is not divided twice.
                td_mean = pat_TD_mean[pat] / 11
                pigd_mean = pat_PIGD_mean[pat] / 5
                motor_ratio = td_mean / (pigd_mean + epsilon)
                if motor_ratio >= 1.15:
                    patient_cluster[pat] = '1'  # TD Subtype
                elif motor_ratio <= 0.90:
                    patient_cluster[pat] = '2'  # PIGD Subtype
                else:
                    patient_cluster[pat] = '3'  # indeterminate Subtype
        print(patient_cluster)