Example #1
def sum_dx_and_bl_pos(row):
    # `isnan` is assumed to be a NaN check that also accepts non-float
    # values (e.g. pandas.isna); math.isnan would raise on string diagnoses
    dx = row['Init_Diagnosis']
    pos = row['AV45_wcereb_BIN1.11']
    if isnan(dx) or isnan(pos):
        return np.nan
    if float(pos) == 0.0:
        new_val = '%s_BLNeg' % (dx,)
    elif float(pos) == 1.0:
        new_val = '%s_BLPos' % (dx,)
    else:
        raise Exception("Unknown positivity: %s" % pos)
    return new_val
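
A minimal usage sketch for the row helper above (the tiny frame and the
pandas-based isnan are assumptions; the surrounding module is not shown):

import numpy as np
import pandas as pd
from pandas import isna as isnan  # assumed NaN check that also accepts strings

df = pd.DataFrame({'Init_Diagnosis': ['AD', np.nan],
                   'AV45_wcereb_BIN1.11': [1.0, 0.0]})
df['DX_BLPos'] = df.apply(sum_dx_and_bl_pos, axis=1)  # ['AD_BLPos', nan]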
Example #2
def check_missing_rate():
    data_path = 'DorCirurgiaCategNAReduzido.csv'  #'Dados/risk_factors_cervical_cancer.csv'
    data = pd.read_csv(data_path,
                       header=0,
                       delimiter=",",
                       na_values=['N/A', 'None', 'nan', 'NAAI', 'NINA'],
                       quoting=0,
                       encoding='utf8',
                       mangle_dupe_cols=False)
    X = data
    print(X.shape)
    patients_missing = []
    ci = 0
    features_missing = []
    cj = 0
    for ix, row in X.iterrows():
        for j in X.columns:
            if (utils.isnan(row[j])):
                if (ix not in patients_missing):
                    ci += 1
                    patients_missing.append(ix)
                if (j not in features_missing):
                    cj += 1
                    features_missing.append(j)
    print(ci / X.shape[0])
    print(cj / X.shape[1])
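
A vectorized pandas equivalent of the two rates printed above (a sketch,
assuming utils.isnan agrees with pandas' notion of missing values):

import pandas as pd

data = pd.read_csv('DorCirurgiaCategNAReduzido.csv',
                   na_values=['N/A', 'None', 'nan', 'NAAI', 'NINA'])
print(data.isna().any(axis=1).mean())  # fraction of rows with at least one NaN
print(data.isna().any(axis=0).mean())  # fraction of columns with at least one NaN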
Example #3
def get_lines_for_label(mdf, label, n, m=0, mins=None):
    '''
    mdf: dataframe containing averages
    label: label for which table lines are generated
    n: number of total elements for each label in mdf
    m: number of elements per row
    mins: label to be bolded for each item
    '''
    lines = []
    r = ((n // m) + 1) * m if m else n  # guard against the m=0 default
    s = '| {} '.format(label)
    for i in range(r):
        if i < n:
            x = mdf.loc[label].iloc[i]
            value = '--' if isnan(x) else '{x:.2f}'.format(x=x)
            if mins is not None and mins[i] == label:
                s += '| **{}** '.format(value)
            else:
                s += '| {} '.format(value)
        else:
            value = '--'
            s += '| {} '.format(value)

        if m != 0 and i != 0 and i != (r - 1) and (i + 1) % m == 0:
            lines.append(s + '|')
            s = '| {} '.format(label)

    lines.append(s + '|')
    return lines
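
A small usage sketch (the frame below is an assumption; mdf is expected to be
indexed by label, with one column per element and math.isnan in scope as isnan):

import numpy as np
import pandas as pd
from math import isnan

mdf = pd.DataFrame({'m1': [1.0], 'm2': [np.nan], 'm3': [2.5]},
                   index=['model'])
for line in get_lines_for_label(mdf, 'model', n=3, m=2):
    print(line)
# | model | 1.00 | -- |
# | model | 2.50 | -- |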
Example #4
def compare(self, v1, v2):
    # even though np.array_equal also works on scalars, we don't use it
    # systematically because it does not work on lists of strings
    if isinstance(v1, np.ndarray) or isinstance(v2, np.ndarray):
        v1, v2 = np.asarray(v1), np.asarray(v2)
        if v1.shape != v2.shape:
            return False, ' (shapes differ: %s vs %s)' % (v1.shape, v2.shape)
        result = np.array_equal(v1, v2)
        nan_v1, nan_v2 = isnan(v1), isnan(v2)
        if (not result and np.any(nan_v1 | nan_v2) and
                np.array_equal(nan_v1, nan_v2)):
            return False, ' but arrays contain NaNs, did you mean to ' \
                          'use assertNanEqual instead?'
        else:
            return result
    else:
        return v1 == v2
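
The NaN hint exists because np.array_equal treats NaN as unequal to itself;
a quick demonstration:

import numpy as np

a = np.array([1.0, np.nan])
print(np.array_equal(a, a))  # False: NaN != NaN element-wise
print(np.isnan(a))           # [False  True] -- the NaN positions still match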
Example #6
def plot_missing_rate():
    data_path = 'RotEOmbroCirurgiaCategNAReduzido.csv'  #'Dados/risk_factors_cervical_cancer.csv'
    class_name = 'Q92510_opcForca[RotEOmbro]'
    #class_name = 'Q92510_snDorPos'
    class_questionnaire = 'Q92510'
    missing_input = 'none'  #'mean'
    transform = False
    scale = True
    use_text = False
    dummy = False
    use_feature_selection = False
    data, original_attributes, categories = read.readData(
        data_path=data_path,
        class_name=class_name,
        class_questionnaire=class_questionnaire,
        missing_input=missing_input,
        dummy=dummy,
        transform_numeric=transform,
        use_text=use_text,
        skip_class_questionnaire=True)
    X = data
    print(X.shape)

    features_missing = [0, 0, 0, 0, 0]

    m = 0
    for j in range((X.shape[1])):
        cj = 0
        for i in range((X.shape[0])):
            if (utils.isnan(X[i][j])):
                cj += 1
        if (cj / X.shape[0] == 0):
            print(original_attributes[j])
            features_missing[0] += 1
        elif (cj / X.shape[0] <= 0.25):
            features_missing[1] += 1
        elif (cj / X.shape[0] <= 0.5):
            features_missing[2] += 1
        elif (cj / X.shape[0] <= 0.75):
            features_missing[3] += 1
        elif (cj / X.shape[0] < 1):
            features_missing[4] += 1

        m += cj / X.shape[0]
    print(m / X.shape[1])
    # exit()  # debug short-circuit in the original; disabled so the plot runs
    print(features_missing)

    plt.pie(
        features_missing[::-1],
        labels=['0%', '0.05% a 25%', '26% a 50%', '51% a 75%',
                '76% a 98%'][::-1],
        # `colors` is not defined in this snippet; matplotlib's default
        # cycle is used instead
        startangle=90,
        radius=1,
        autopct=lambda p: '{:.0f}'.format(p * sum(features_missing) / 100))
    plt.show()
Example #7
def format_value_error(value, error):
    if isnan(value):
        return '--'
    if error == 0.0:
        return '{}'.format(value)
    (truncated_error, sigfigs) = process_error(error)
    formatted_value = format_value(value, sigfigs)
    # return '{:.2f} ± {:.2f}'.format(formatted_value, truncated_error)
    # return '{} ± {}'.format(formatted_value, truncated_error)
    return '{} ({})'.format(formatted_value, truncated_error)
Example #8
def interpret_real(s, context=None):
    """Convert a raw Real value to the float it represents.

    This is more lenient than the SGF spec: it accepts strings accepted as a
    float by the platform libc. It rejects infinities and NaNs.

    """
    result = float(s)
    if isinf(result):
        raise ValueError("infinite")
    if isnan(result):
        raise ValueError("not a number")
    return result
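
Usage of the Real interpreter above (assuming isinf and isnan come from math,
as the bare names suggest):

from math import isinf, isnan

print(interpret_real('4.5'))   # 4.5
print(interpret_real('-2e3'))  # -2000.0
interpret_real('inf')          # raises ValueError: infinite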
Example #9
def get_test_comparison_df(df,
                           l1,
                           l2,
                           l3=None,
                           suffix=None,
                           errors=True,
                           formatting=None):
    tests = test_all_evolutions(df, l1, l2)
    comparisons = [test['result'] for test in tests if test is not None]

    [avg1, avg2] = [np.mean(df.loc[l]) for l in [l1, l2]]
    [std1, std2] = [np.std(df.loc[l]) for l in [l1, l2]]

    effect_size = [
        process_effect_size(test['d']) if test else None for test in tests
    ]
    # hypothesis: assume that both labels obtain the same result
    hypothesis_results = [
        'Not Reject' if x == 'eq' else 'Reject' for x in comparisons
    ]

    [label1, label2] = [
        l if suffix is None else '{} {}'.format(l, suffix) for l in [l1, l2]
    ]

    lines1 = []
    lines2 = []
    lines3 = []

    if l3:
        avg3 = np.mean(df.loc[l3])
        std3 = np.std(df.loc[l3])
        label3 = l3 if suffix is None else '{} {}'.format(l3, suffix)

    n = get_num_evolutions(df)
    for i in range(0, n):
        if errors:
            line1 = format_value_error(avg1[i], std1[i])
            line2 = format_value_error(avg2[i], std2[i])
            line3 = format_value_error(avg3[i], std3[i]) if l3 else None
        else:
            line1 = '--' if isnan(avg1[i]) else '{:.2f}'.format(avg1[i])
            line2 = '--' if isnan(avg2[i]) else '{:.2f}'.format(avg2[i])
            if l3:
                line3 = '--' if isnan(avg3[i]) else '{:.2f}'.format(avg3[i])

        # we don't have to format line3 since it is not in the comparison
        if formatting == 'markdown' and i < len(comparisons):
            line1 = f'**{line1}**' if comparisons[i] == 'lt' else line1
            line2 = f'**{line2}**' if comparisons[i] == 'gt' else line2

        elif formatting == 'latex' and i < len(comparisons):
            line1 = '\\textbf{{{}}}'.format(
                line1) if comparisons[i] == 'lt' else line1
            line2 = '\\textbf{{{}}}'.format(
                line2) if comparisons[i] == 'gt' else line2

        lines1.append(line1)
        lines2.append(line2)
        if l3:
            lines3.append(line3)

    effect_size = [es if es else '--' for es in effect_size]

    if l3:
        data = {
            label1: lines1,
            label2: lines2,
            label3: lines3,
            'Effect Size': effect_size
        }
    else:
        data = {label1: lines1, label2: lines2, 'Effect Size': effect_size}

    return pd.DataFrame(data=data)
Example #10
            loss1 = F.nll_loss(score_s, target_s) + F.nll_loss(score_e, target_e)
            #print({'loss1:':loss1})
            
            # P(c|Q), consider mention
            pure_Q=pure_Q.clone()
            pure_Q[Q_mask.data==0]=-float('inf')
            #pure_Q.data.masked_fill_(Q_mask.data==0,-float('inf')) 
            B_max_Q=pure_Q.unsqueeze(1).expand_as(CQ_mask).clone()   # B,max_Q  --> B,max_c,max_Q  (give all Q' original score to c)
            # B_max_Q_old1=B_max_Q.clone()
            B_max_Q[CQ_mask.data==0]=-float('inf')                   # B,max_c,max_Q            mask, get each c's real Q
            B_max_Q=F.softmax(B_max_Q.view(-1,B_max_Q.size(2))).view(B_max_Q.size(0),-1,B_max_Q.size(2))  # B,max_c,max_Q , get each c's P(Q|c)  some max_c 's, have no Q
            # rows for real candidates have no NaNs; padding rows beyond the
            # real candidate count come out of the softmax as all-NaN
            # B_max_Q.data.masked_fill_(isnan(B_max_Q.data.cpu()).cuda(),1)   # log(p==1)==0, no loss
            # B_max_Q_old2=B_max_Q.clone()
            B_max_Q=B_max_Q.clone()
            B_max_Q[isnan(B_max_Q.data.cpu()).cuda()]=1
            # print(B_max_Q)
            # P(Q)
            ans_in_can=to_var(np.array(ans_in_can,dtype='int64'),use_cuda=self.args.cuda)           # B,
            ans_index=ans_in_can.unsqueeze(1).expand(B_max_Q.size(0),B_max_Q.size(2)).unsqueeze(1)  # B,1,max_Q
            final_Q=B_max_Q.gather(1,ans_index).squeeze(1)                                          # B,max_Q 
            # print(final_Q)
            assert torch.sum(isnan(final_Q.data.cpu()))==0
            # ans in Q
            answear=to_var(np.array(ans_in_Q,dtype='int64'),use_cuda=self.args.cuda)

            if self.args.db_softmax:
                loss2 =F.nll_loss(F.log_softmax(final_Q),answear)
            else:
                answear_index=answear.unsqueeze(1) # B,1
                predict_prob=final_Q.gather(1,answear_index.long()) # B,1
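
The NaN patching above is needed because a softmax over a fully masked
(all -inf) row yields NaNs, and setting those entries to 1 makes their
log-loss contribution zero. A quick check:

import torch
import torch.nn.functional as F

row = torch.full((1, 3), float('-inf'))
print(F.softmax(row, dim=1))  # tensor([[nan, nan, nan]])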
Example #11
def plot_followup_movements():

    data_path = '~/Faculdade/Mestrado/Projeto/scripts/Working Scripts/'
    data_path = data_path + 'EXPERIMENT_DOWNLOAD/Group_patients-with-brachial-plexus-injury/Per_questionnaire_data/'
    #data_path = data_path + 'Q61802_unified-surgical-evaluation/Responses_Q61802.csv'
    data_path = data_path + 'Q92510_unified-follow-up-assessment/Responses_Q92510.csv'

    data = pd.read_csv(data_path,
                       header=0,
                       delimiter=",",
                       na_values=['N/A', 'None', 'nan', 'NAAI', 'NINA'],
                       quoting=0,
                       encoding='utf8',
                       mangle_dupe_cols=False)

    admission_data = pd.read_csv(
        '~/Faculdade/Mestrado/Projeto/scripts/Working Scripts/FlexCotoveloNew.csv',
        header=0,
        delimiter=",",
        na_values=['N/A', 'None', 'nan', 'NAAI', 'NINA'],
        quoting=0,
        encoding='utf8',
        mangle_dupe_cols=False)

    outcome_right = 'opcForcaD[FlexCotovelo]'  #'snDorPos'
    outcome_left = 'opcForcaE[FlexCotovelo]'  #'snDorPos'
    #print(len(([int(a/30) for a in data['formTempoAval']])))
    patients_considered = {}
    patient_outcomes = {}
    #return_periods = []
    for i, row in data.iterrows():
        if (row['participant code']) not in patients_considered:
            patients_considered[row['participant code']] = row['formTempoAval']

            if (np.all(admission_data['Q44071_opcLdLesao']
                       [admission_data['participant code'] ==
                        row['participant code']] == 'D')):
                patient_outcomes[row['participant code']] = row[outcome_right]
            elif (np.all(admission_data['Q44071_opcLdLesao']
                         [admission_data['participant code'] ==
                          row['participant code']] == 'E')):
                patient_outcomes[row['participant code']] = row[outcome_left]
            else:
                print('Preprocessing of side {0} not implemented'.format(
                    admission_data['Q44071_opcLdLesao'][
                        admission_data['participant code']]))

            #return_periods.append(row['formTempoAval'])
        else:
            if (row['formTempoAval'] >
                    patients_considered[row['participant code']]):
                if (np.all(admission_data['Q44071_opcLdLesao']
                           [admission_data['participant code'] ==
                            row['participant code']] == 'D')):
                    if (row[outcome_right] != 'NINA'
                            and not utils.isnan(row[outcome_right])):
                        patient_outcomes[
                            row['participant code']] = row[outcome_right]
                        patients_considered[
                            row['participant code']] = row['formTempoAval']
                elif (np.all(admission_data['Q44071_opcLdLesao']
                             [admission_data['participant code'] ==
                              row['participant code']] == 'E')):
                    if (row[outcome_left] != 'NINA'
                            and not utils.isnan(row[outcome_left])):
                        patient_outcomes[
                            row['participant code']] = row[outcome_left]
                        patients_considered[
                            row['participant code']] = row['formTempoAval']

            else:
                if (utils.isnan(patient_outcomes[row['participant code']])):
                    if (np.all(admission_data['Q44071_opcLdLesao']
                               [admission_data['participant code'] ==
                                row['participant code']] == 'D')):
                        if (row[outcome_right] != 'NINA'
                                and not utils.isnan(row[outcome_right])):
                            patient_outcomes[
                                row['participant code']] = row[outcome_right]
                            patients_considered[
                                row['participant code']] = row['formTempoAval']
                    elif (np.all(admission_data['Q44071_opcLdLesao']
                                 [admission_data['participant code'] ==
                                  row['participant code']] == 'E')):
                        if (row[outcome_left] != 'NINA'
                                and not utils.isnan(row[outcome_left])):
                            patient_outcomes[
                                row['participant code']] = row[outcome_left]
                            patients_considered[
                                row['participant code']] = row['formTempoAval']

                #print(row['participant code'])
    #import pdb
    #pdb.set_trace()
    #labels = {'S':'Sim','N':'Não',np.nan:'Não informado'}
    for k in patients_considered.keys():
        patients_considered[k] = int(patients_considered[k] / 30)

    xlabels = list(np.arange(6)) + [np.nan]  #['N','S',np.nan]
    label = lambda x: 'Não informado' if utils.isnan(x) else x
    #labels = {'S':'Sim','N':'Não',np.nan:'Não informado'}
    y = [0] * 7
    for value in patient_outcomes.values():
        if (utils.isnan(value)):
            y[-1] += 1
        else:
            y[int(value)] += 1
    width = 0.8
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.bar(range(len(xlabels)), y, width=width)
    ax.set_xticks(np.arange(len(xlabels)) + width / 2)
    ax.set_yticks(range(0, 30, 5))
    ax.set_xticklabels([label(l) for l in xlabels])
    print(Counter(patient_outcomes.values()))
    plt.xlabel('Força muscular avaliada sobre flexão do cotovelo')
    plt.show()
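
The per-patient bookkeeping above mostly implements "keep the latest
follow-up, but never let a non-missing outcome be replaced by a missing
one". A rough vectorized approximation (a sketch, not a drop-in
replacement for the side-dependent outcome choice):

import pandas as pd

# assume `data` already carries the correct outcome column per patient
latest = (data.sort_values('formTempoAval')
              .groupby('participant code')
              .last())  # last() skips NaN, so earlier non-missing values survive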
Example #12
def plot_event():

    data_path = '~/Faculdade/Mestrado/Projeto/scripts/Working Scripts/'
    data_path = data_path + 'EXPERIMENT_DOWNLOAD/Group_patients-with-brachial-plexus-injury/Per_questionnaire_data/'
    data_path = data_path + 'Q44071_unified-admission-assessment/Responses_Q44071.csv'

    data = pd.read_csv(data_path,
                       header=0,
                       delimiter=",",
                       na_values=['N/A', 'None', 'nan', 'NAAI', 'NINA'],
                       quoting=0,
                       encoding='utf8',
                       mangle_dupe_cols=False)

    events_right = data.filter(like='lisTpTraumaD')
    events_left = data.filter(like='lisTpTraumaE')
    events_description = {
        'lisTpTrauma[arma]': 'Arma de fogo',
        'lisTpTrauma[moto]': 'Acidente motociclístico',
        'lisTpTrauma[auto]': 'Acidente automobilístico',
        'lisTpTrauma[atropelamento]': 'Atropelamento',
        'lisTpTrauma[cirurgia]': 'Cirurgia',
        'lisTpTrauma[corte]': 'Objeto cortante',
        'lisTpTrauma[ocupacao]': 'Acidente ocupacional',
        'lisTpTrauma[other]': 'Outros'
    }
    event_names = {}
    for c in events_right.columns:
        event_names[re.sub('D', '', c)] = [re.sub('D', 'E', c), c]

    index = 0
    x = np.arange(len(event_names.keys()))

    width = 0.4
    fig = plt.figure()
    ax = fig.add_subplot(111)
    i = 0
    events_in_plot = []
    for event in sorted(event_names.keys()):
        yleft = sum([
            a[1] for a in Counter(events_left[event_names[event][0]]).items()
            if not utils.isnan(a[0])
        ])
        yright = sum([
            a[1] for a in Counter(events_right[event_names[event][1]]).items()
            if not utils.isnan(a[0])
        ])
        if (yleft != 0 or yright != 0):
            l = plt.bar(i, yleft, width, color='blue')
            r = plt.bar(i + width, yright, width, color='red')
            events_in_plot.append(event)
        else:
            continue
        i += 1

    #print([[Counter(events_left[event_names[event][0]]) for event in y] for event in y])
    #exit()
    # y = sorted(event_names.keys())
    # left = plt.bar(x, [Counter(events_left[event_names[event][0]])['Y'] for event in y], width,color='blue')
    # right = plt.bar(x+width, [Counter(events_right[event_names[event][1]])['Y'] for event in y], width,color='red')
    ax.set_xticks(np.arange(i) + width)
    ax.set_xticklabels([events_description[e] for e in events_in_plot],
                       rotation=90)
    plt.ylabel('Frequência')
    ax.legend((l, r), ('Esquerdo', 'Direito'))
    plt.tight_layout()
    #plt.width = width
    plt.show()
Example #13
def transform_to_JSON(clf,
                      fcs,
                      out='FeatureContributions.json',
                      diffsur=True,
                      X=None,
                      addline=None):
    import json
    import pandas as pd
    import utils
    if (X is None):
        if (not isinstance(clf.X, pd.DataFrame)):
            X = pd.DataFrame(clf.X, columns=clf.attributes)
        else:
            X = clf.X

    #data = read.readData(data_path = data_path, class_name = class_name)
    #newcolumns = np.append(X.columns,['Q44071_snCplexoAt',class_name])
    #newX = pd.merge(data,X,how='inner',on='Q44071_participant_code')[newcolumns]
    F = {}

    for i in range(len(fcs)):
        if (diffsur):
            for feature_index in fcs[i].keys():
                if (feature_index not in F):
                    F[feature_index] = {
                        'name':
                        clf.attributes[feature_index],
                        'ycategs':
                        sorted(
                            list([
                                a for a in set(X[X.columns[feature_index]])
                                if not utils.isnan(a)
                            ])) + ['nan'],
                        'redopoints': [],
                        'redxpoints': [],
                        'blueopoints': [],
                        'bluexpoints': []
                    }
                if (clf.X['Q44071_snCplexoAt'][i] == 'S'):
                    if (clf.y[i] == 'INSATISFATORIO'):
                        if (not utils.isnan(X[X.columns[feature_index]][i])):
                            F[feature_index]['redopoints'].append([
                                round(fcs[i][feature_index], 5),
                                F[feature_index]['ycategs'].index(
                                    X[X.columns[feature_index]][i])
                            ])
                        else:
                            F[feature_index]['redopoints'].append([
                                round(fcs[i][feature_index], 5),
                                len(F[feature_index]['ycategs']) - 1
                            ])
                    else:
                        if (not utils.isnan(X[X.columns[feature_index]][i])):
                            F[feature_index]['blueopoints'].append([
                                round(fcs[i][feature_index], 5),
                                F[feature_index]['ycategs'].index(
                                    X[X.columns[feature_index]][i])
                            ])
                        else:
                            F[feature_index]['blueopoints'].append([
                                round(fcs[i][feature_index], 5),
                                len(F[feature_index]['ycategs']) - 1
                            ])

                else:
                    if (clf.y[i] == 'INSATISFATORIO'):
                        if (not utils.isnan(X[X.columns[feature_index]][i])):
                            F[feature_index]['redxpoints'].append([
                                round(fcs[i][feature_index], 5),
                                F[feature_index]['ycategs'].index(
                                    X[X.columns[feature_index]][i])
                            ])
                        else:
                            F[feature_index]['redxpoints'].append([
                                round(fcs[i][feature_index], 5),
                                len(F[feature_index]['ycategs']) - 1
                            ])

                    else:
                        if (not utils.isnan(X[X.columns[feature_index]][i])):
                            F[feature_index]['bluexpoints'].append([
                                round(fcs[i][feature_index], 5),
                                F[feature_index]['ycategs'].index(
                                    X[X.columns[feature_index]][i])
                            ])
                        else:
                            F[feature_index]['bluexpoints'].append([
                                round(fcs[i][feature_index], 5),
                                len(F[feature_index]['ycategs']) - 1
                            ])
        else:
            for feature_index in fcs[i].keys():
                if (feature_index not in F.keys()):
                    if (isinstance(X, pd.DataFrame)):
                        F[feature_index] = {
                            'name':
                            clf.attributes[feature_index],
                            'value':
                            X.values[i][feature_index]
                            if not utils.isnan(X.values[i][feature_index]) else
                            'nan',
                            'contribution':
                            0
                        }
                    else:
                        F[feature_index] = {
                            'name':
                            clf.attributes[feature_index],
                            'value':
                            X[i][feature_index]
                            if not utils.isnan(X[i][feature_index]) else 'nan',
                            'contribution':
                            0
                        }

                F[feature_index]['contribution'] = fcs[i][feature_index]

    if (addline is not None):
        F['classification'] = addline
    jsonfile = json.dumps(F, ensure_ascii=False)
    # use a context manager so the file is flushed and closed
    with open(out, 'w') as file:
        file.write(jsonfile)
Example #14
assert (m.predict(['RAIN', 80, 70, 'T']) == "DON'T PLAY")
assert (m.predict(['SUNNY', 50, 50, 'T']) == 'PLAY')
assert (m.predict(['SUNNY', 50, 91, 'T']) == "DON'T PLAY")
assert (m.predict([np.nan, 50, 91, 'T']) == "DON'T PLAY")

print('Testing Decision Tree with missing values (branch_nan = True)...')
m = dt.DecisionTreeClassifier(missing_branch=True)
# data, attributes, categories  = read.readData(data_path = '../Dados/Test_with_nan.csv', class_name='Class',
#     dummy=dummy,transform_numeric=transform,use_text = use_text,missing_input='none')
X[5][0] = np.nan  # X = data[:,0:-1]
# y = np.array(data[:,-1])
m.fit(X, y)
m.to_pdf(original_attributes, out='out.pdf')
outlook_index = np.where(original_attributes == 'Outlook')[0][0]
not_nan_rows = [
    a for a in range(X.shape[0]) if not utils.isnan(X[:, outlook_index][a])
]
Xnotnan = (X[not_nan_rows, :])
ynotnan = y[not_nan_rows]
Xs, ys, d = utils.split_categ(Xnotnan, ynotnan, outlook_index,
                              list(set(Xnotnan[:, outlook_index])))

assert (np.isclose(
    (len(ynotnan) / len(y)) * utils.information_gain(ynotnan, ys),
    0.199,
    rtol=1e-2))
assert (np.isclose((len(ynotnan) / len(y)) * utils.gain_ratio(ynotnan, ys, y),
                   0.110,
                   rtol=1e-2))
#outlook, temperature, humidity, windy
assert (m.predict((['OVERCAST', 80, 90, 'T'])) == 'Play'.upper())
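
For reference, minimal definitions consistent with how utils.entropy and
utils.information_gain are called above (an assumption -- the utils module
itself is not shown):

import numpy as np

def entropy(y):
    # Shannon entropy of a label array, in bits
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return float(-(p * np.log2(p)).sum())

def information_gain(y, ys):
    # entropy of the parent minus the size-weighted entropy of the splits
    n = len(y)
    return entropy(y) - sum(len(s) / n * entropy(s) for s in ys)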
Example #15
    def build_tree(self,
                   Xc,
                   yc,
                   feature_indices,
                   depth,
                   weights,
                   pdist=None):  #,parent_fiv='root'):

        # only consider the instances that are at the node, partially or entirely

        rows_to_consider = sorted(weights.keys())
        X = Xc[rows_to_consider, :]
        y = yc[rows_to_consider]

        # calculate the class distribution at the node (absolute values)
        dist = {}
        for k in set(yc):
            dist[k] = 0
        for k in weights.keys():
            dist[yc[k]] += weights[k]

        # if all "whole" instances at the node belong to the same class or if maximum tree depth was reached
        if (utils.entropy(y) == 0
                or (len([k for k in dist.keys() if dist[k] < 1]) > 0)
                or depth == self.max_depth):

            # in case of a tie of the class distributions, final class will be the most frequent
            # class at the parent node
            if (len(dist.keys()) > 1 and len(set(dist.values())) == 1
                    and pdist is not None):
                #print('tie of class distributions. depth: %r distribution: %r' % (depth,dist))
                final_class = max(pdist.keys(), key=lambda k: pdist[k])

            # final class will be the most frequent class at the node
            else:
                final_class = max(dist.keys(), key=lambda k: dist[k])

            # return a decision node
            return Node(feature_index=None,
                        values=None,
                        branches=None,
                        branch_nan=None,
                        sample_size=sum(
                            [k for k in weights.values() if k == 1]),
                        distr=dist,
                        is_class=True,
                        final_class=final_class
                        )  #,config=parent_fiv+'->'+str(final_class))

        # get the feature and its split value(s) that maximize the information gain
        if (self.random_subspace is False and self.mtry is not None):
            nfeature_indices = random.sample(
                list(feature_indices), int(self.mtry(len(feature_indices))))
        else:
            nfeature_indices = feature_indices
        feature_index, values = self.find_split(X, y, nfeature_indices,
                                                weights)

        #if the best split could not be found, returns a decision node
        if (feature_index == -1):
            #print('best split could not be found.')
            if (len(dist.keys()) > 1 and len(set(dist.values())) == 1
                    and pdist is not None):
                #print('tie of class distributions. depth: %r distribution: %r' % (depth,dist))
                final_class = max(pdist.keys(), key=lambda k: pdist[k])

            # final class will be the most frequent class at the node
            else:
                final_class = max(dist.keys(), key=lambda k: dist[k])

            return Node(feature_index=None,
                        values=None,
                        branches=None,
                        branch_nan=None,
                        sample_size=sum(
                            [k for k in weights.values() if k == 1]),
                        distr=dist,
                        is_class=True,
                        final_class=final_class)

        # get rows where the values of X for the feature are not missing
        not_nan_rows = [
            a for a in range(X.shape[0])
            if (not utils.isnan(X[:, feature_index][a]))
        ]
        # get the rows where they are missing
        nan_rows = np.delete(list(range(X.shape[0])), not_nan_rows)

        Xnotnan = (X[not_nan_rows, :])
        ynotnan = y[not_nan_rows]
        ynan = y[nan_rows]

        # get the sets (and its weights) that result when the not missing data are split
        # based on the feature and its value(s)
        Xs, ys, dweights = utils.split(Xnotnan, ynotnan, feature_index, values)

        # if instances belong to only one subset, returns a decision node -- might be useless
        if (len(ys) < 2):
            #print('instances belong to only one subset.')

            if (len(dist.keys()) > 1 and len(set(dist.values())) == 1
                    and pdist is not None):
                #print('tie of class distributions. depth: %r distribution: %r' % (depth,dist))
                final_class = max(pdist.keys(), key=lambda k: pdist[k])

            # final class will be the most frequent class at the node
            else:
                final_class = max(dist.keys(), key=lambda k: dist[k])

            # if(self.print):
            return Node(feature_index=None,
                        values=None,
                        branches=None,
                        branch_nan=None,
                        sample_size=sum(
                            [k for k in weights.values() if k == 1]),
                        distr=dist,
                        is_class=True,
                        final_class=final_class)

        branch_nan = None
        branches = []

        # translate the dweights indexes to the weights indexes
        for i in range(len(dweights)):
            dweights[i] = dict(
                (rows_to_consider[not_nan_rows[j]], dweights[i][j])
                for j in dweights[i])
            for j in dweights[i].keys():
                if j in weights.keys():
                    dweights[i][j] = weights[j]

        # sum of the weights of the instances in the node with known values
        s = (sum([sum(x.values()) for x in dweights]))

        # for each split set
        for i in range(len(ys)):
            # if it's not empty
            if len(ys[i]) != 0:
                # C4.5 approach
                if (self.missing_branch is False):
                    # calculate probability of outcome values[i], estimated as the sum of the weights
                    # of instances in the node known to have outcome values[i] divided by the sum of the
                    # weights of the cases in the node with known outcomes
                    prob_values_i = round(float(sum(dweights[i].values()) / s),
                                          5)
                    # for each instance with missing value, update its weight for the child node
                    for j in nan_rows:
                        (dweights[i])[rows_to_consider[j]] = weights[
                            rows_to_consider[j]] * prob_values_i

                branches.append(
                    self.build_tree(
                        Xc, yc, feature_indices, depth + 1, dweights[i],
                        dist))  #,parent_fiv=str(feature_index)+'->'+v))

        # nan branch approach
        if (self.missing_branch):
            # if there are samples with missing values
            if (ynan.shape[0] != 0):
                # continue building tree from the nan branch

                branch_nan = self.build_tree(
                    Xc, yc, feature_indices, depth + 1,
                    dict([[a, 1] for a in np.array(rows_to_consider)[nan_rows]
                          ]), dist)  #,str(feature_index)+'->NAN')
            # if there aren't, then assign to the nan branch a decision node with no instances
            # (for future classification purposes).
            else:
                if (len(dist.keys()) > 1 and len(set(dist.values())) == 1
                        and pdist is not None):
                    #print('tie of class distributions. depth: %r distribution: %r' % (depth,dist))
                    final_class = max(pdist.keys(), key=lambda k: pdist[k])
                # final class will be the most frequent class at the node
                else:
                    final_class = max(dist.keys(), key=lambda k: dist[k])

                # assign to the nan branch a decision node
                branch_nan = Node(feature_index=None,
                                  values=None,
                                  branches=None,
                                  branch_nan=None,
                                  sample_size=0,
                                  distr={k: 0
                                         for k in set(y)},
                                  is_class=True,
                                  final_class=final_class
                                  )  #,config=parent_fiv+'->'+str(final_class))

        same_class = False
        fclass = branches[0].final_class
        if (fclass is not None):
            for child in range(1, len(branches)):
                if (branches[child].final_class != fclass):
                    same_class = False
                    break
                if (child == len(branches) - 1):
                    if (branch_nan):
                        if (branch_nan.final_class != fclass):
                            same_class = False
                        else:
                            same_class = True
                    else:
                        same_class = True
        if (same_class is True):
            #print('class node - all children nodes belong to the same class')
            if (len(dist.keys()) > 1 and len(set(dist.values())) == 1
                    and pdist is not None):
                #print('tie of class distributions. depth: %r distribution: %r' % (depth,dist))
                final_class = max(pdist.keys(), key=lambda k: pdist[k])

            else:
                final_class = max(dist.keys(), key=lambda k: dist[k])

            return Node(feature_index=None,
                        values=None,
                        branches=None,
                        branch_nan=None,
                        sample_size=sum(
                            [k for k in weights.values() if k == 1]),
                        distr=dist,
                        is_class=True,
                        final_class=final_class
                        )  #,config=parent_fiv+'->'+str(final_class))

        # returns a test node with its feature index and values and its branches.
        return Node(feature_index=feature_index,
                    values=values,
                    branches=branches,
                    branch_nan=branch_nan,
                    sample_size=sum([k for k in weights.values() if k == 1]),
                    distr=dist)
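
The fractional-weight handling above follows C4.5: an instance whose split
feature is missing is sent down every branch, carrying a weight proportional
to that branch's share of the known instances. A toy numeric sketch:

# a node holds 10 instances with known values, split 6/4 between
# two branches; an instance with a missing value (weight 1.0) goes
# down both branches with proportional weights
known = [6, 4]
s = sum(known)                            # 10
probs = [w / s for w in known]            # [0.6, 0.4]
child_weights = [1.0 * p for p in probs]  # [0.6, 0.4]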
Example #16
    def feature_contribution(self, X=None):
        print('calculating feature contribution')
        #C = set(self.y)
        if(X is None):
            if(isinstance(self.X,pd.DataFrame)):
                X = self.X.values
            else:
                X = self.X
        else:
            if(isinstance(X, pd.DataFrame) and X.shape[1] != self.X.shape[1]):
                X = X[X.columns[[np.where(a == X.columns)[0][0] for a in self.X.columns if a in X.columns]]]
                for f in range(len(self.X.columns)):
                    if(self.X.columns[f] not in X.columns):
                        X.insert(f,self.X.columns[f],[np.nan]*X.shape[0])
            X = X.values
        if(self.control_class is None):
            if('SUCESSO' in set(self.y)):
                control_class = 'SUCESSO'
            else:
                control_class = list(set(self.y))[0]
            print('Control class set as %r' % control_class)
        else:
            control_class = self.control_class 


        fcs = []

        for i in range(X.shape[0]):

            FC = {}
            c = 0
            #for k in C:
            t_index = 0
            # if(i_index == 9):
            #     import pdb
            #     pdb.set_trace()
            
            for t in self.forest:
                if(i in self.forest[t_index].oob):
                    #print(oob[t_index])
                    t_index+=1
                    continue

                t_index +=1
                child_list = [[1,t.root]]   
                

                while len(child_list) > 0:
                    w, parent = child_list.pop(0)
                    
                    while parent.is_class is False:
                        f = parent.feature_index

                        #print(i[f])
                        #print(parent.values)
                        if(f not in FC.keys()):
                            FC[f] = 0
                        #    FC[f] =  {c:0 for c in C}

                        if(utils.isnan(X[i][f])):
                            if(parent.branch_nan is None):
                                sp = sum(parent.distr.values())
                                for c in parent.branches:
                                    child_list.append([round(w*(sum(c.distr.values()))/sp,2),c])
                                w,child = child_list.pop(0)
                            else:
                                child = parent.branch_nan
                        else:
                            if(len(parent.values) == 1):
                                if X[i][f] <= parent.values[0]:
                                    child = parent.branches[0]
                                else:
                                    child = parent.branches[1]
                            else:
                                if(str(X[i][f]) not in parent.values):
                                    if(parent.branch_nan is None):
                                        sp = sum(parent.distr.values())
                                        for c in parent.branches:
                                            child_list.append([round(w*(sum(c.distr.values()))/sp,2),c])
                                        w,child = child_list.pop(0)
                                    else:
                                        child = parent.branch_nan

                                else:

                                    child = parent.branches[parent.values.index(str(X[i][f]))]


                        sc = sum(child.distr.values())
                        if(sc == 0):
                            child.distr = t.root.distr
                            sc = sum(child.distr.values())
                        sp = sum(parent.distr.values())

                        FC[f] = FC[f] + w*(child.distr[control_class]/sc - parent.distr[control_class]/sp)

                        parent = child

            for element in FC:
                FC[element] = FC[element] / self.ntrees
                #for el in FC[element]:
                #    FC[element][el] = FC[element][el] / self.ntrees

            fcs.append(FC)
        return fcs
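
The per-feature update in feature_contribution credits each split with the
change it causes in the control-class probability, w * (P_child - P_parent),
averaged over trees. Contributions along a single path telescope, as this toy
trace illustrates:

# control-class fractions along one path:
# root 5/10 -> split on f -> child 4/5 -> split on g -> leaf 3/3
w = 1.0
fc_f = w * (4/5 - 5/10)  # 0.3, credited to feature f
fc_g = w * (3/3 - 4/5)   # 0.2, credited to feature g
# root-to-leaf change: 1.0 - 0.5 = 0.5 = fc_f + fc_g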
Example #17
    def __call__(self, input):

        def _just_resize():
            img = input['img']
            w, h = img.size

            # perform scaling
            input['img'] = img.resize((self.ix, self.iy), Image.ANTIALIAS)

            if np.sum(input['loc']) != 0:
                loc = input['loc']
                loc[0, :] = loc[0, :] * self.ix / w
                loc[1, :] = loc[1, :] * self.iy / h
                input['loc'] = loc

        def _transform():
            angle = self.rangle * (2 * torch.rand(1)[0] - 1)
            grad_angle = angle * math.pi / 180
            scale = 1 + self.rscale * (2 * torch.rand(1)[0] - 1)
            transx = self.rtrans * (2 * torch.rand(1)[0] - 1)
            transy = self.rtrans * (2 * torch.rand(1)[0] - 1)

            img = input['img']
            w, h = img.size
            centerX, centerY = w // 2, h // 2

            # perform rotation
            img = img.rotate(angle, Image.BICUBIC)
            # perform translation
            img = img.transform(img.size, Image.AFFINE,
                                (1, 0, transx, 0, 1, transy))
            # perform scaling
            img = img.resize((int(math.ceil(scale * h)),
                              int(math.ceil(scale * w))),
                             Image.ANTIALIAS)

            w, h = img.size
            x1 = round((w - self.ix) // 2)
            y1 = round((h - self.iy) // 2)
            input['img'] = img.crop((x1, y1, x1 + self.ix, y1 + self.iy))

            if np.sum(input['loc']) != 0:
                loc = input['loc']

                newloc = np.ones((3, loc.shape[1]))
                newloc[0:2, :] = loc

                trans_matrix = np.array([[1,0,-1*transx], [0,1,-1*transy], [0,0,1]])
                scale_matrix = np.array([[scale,0,0], [0,scale,0], [0,0,1]])
                angle_matrix = np.array([
                    [math.cos(grad_angle),math.sin(grad_angle),0],
                    [-math.sin(grad_angle),math.cos(grad_angle),0],
                    [0,0,1]])

                # perform rotation
                newloc[0,:] = newloc[0,:] - centerY
                newloc[1,:] = newloc[1,:] - centerX
                newloc = np.dot(angle_matrix, newloc)
                newloc[0,:] = newloc[0,:] + centerY
                newloc[1,:] = newloc[1,:] + centerX
                # perform translation
                newloc = np.dot(trans_matrix, newloc)
                # perform scaling
                newloc = np.dot(scale_matrix, newloc)

                newloc[0,:] = newloc[0,:] - y1
                newloc[1,:] = newloc[1,:] - x1
                input['loc'] = newloc[0:2,:]

                for i in range(input['loc'].shape[1]):
                    if not np.isnan(input['loc'][:, i]).any():
                        if np.any(input['loc'][:, i] < 0) or \
                                        input['loc'][0,i] > self.iy or \
                                        input['loc'][1,i] > self.ix:
                            input['loc'][:, i] = np.nan
                            # TODO: fill the surrounding with normal noise
                            input['occ'][0, i] = 0

        # FIXME: create multiple images for the same sample with different occluded blocks for testing purposes
        # input['im'][:, 10:40, 22:50] = 0

        # adding one more at the end for the center landmark
        # add the center of image as the last landmark
        h, w = input['img'].size
        input['loc'] = np.hstack((input['loc'], np.array([[w // 2], [h // 2]])))

        input['occ'] = torch.cat((input['occ'], torch.ByteTensor([[1]])), 1)
        input['mask'] = torch.cat((input['mask'], torch.ByteTensor([[1]])), 1)
        orig_img = input['img']
        orig_loc = input['loc']
        orig_occ = input['occ'].clone()
        orig_mask = input['mask'].clone()

        _transform()

        if self.keep_landmarks_visible:
            # train: making sure all landmarks are still visible, if not perform
            #        another transformation
            mask = input['mask']
            mask2D = torch.cat((mask, mask), dim=0)
            landmarks = torch.from_numpy(input['loc'])
            limit = 100
            while not (mask == mask * input['occ']).all() or utils.isnan(landmarks[mask2D]).any():
                input['img'] = orig_img
                input['loc'] = orig_loc
                input['occ'] = orig_occ.clone()
                input['mask'] = orig_mask.clone()

                _transform()

                mask = input['mask']
                mask2D = torch.cat((mask, mask), dim=0)
                landmarks = torch.from_numpy(input['loc'])

                limit -= 1
                if limit == 0:
                    input['img'] = orig_img
                    input['loc'] = orig_loc
                    input['occ'] = orig_occ.clone()
                    input['mask'] = orig_mask.clone()
                    _just_resize()
                    print('using the original data because even after 100 transformations, there are still occluded landmarks!')
                    break

        input['tgt'] = self.toHeatmaps(input['loc'], self.image_resolution)

        return input
Example #18
 def predict(self,ex,top_n=1,pool=None,normalize_ss=False,exp_final_Q=False): 
     self.network.eval()
     train_mode=self.args.train_mode
     # old (legacy PyTorch <0.4 API: Variable/volatile, and `async` was later
     # renamed to non_blocking once it became a Python keyword)
     if train_mode=='string_match':
         if self.use_cuda:
             inputs = [e if e is None else Variable(e.cuda(async=True), volatile=True) for e in ex[:5]]
         else:
             inputs = [e if e is None else Variable(e, volatile=True) for e in ex[:5]]
         score_s, score_e = self.network(*inputs)  # no normalize, just exp
         # Decode predictions
         score_s = score_s.data.cpu()
         score_e = score_e.data.cpu()  
         max_len=15
         args = (score_s, score_e, top_n, max_len)
         # return 
         # pred_s :B,top_n    each ex's top_n start token pos
         # pred_e :B,top_n    each ex's top_n end   token pos
         # pred_score: B,top_n    each ex's top_n span's score
         if pool:
             return pool.apply_async(self.decode, args)
         else:
             return self.decode(*args)    
     
     if train_mode=='string_match_base_dis':
         dw, f, dw_mask, qw, qw_mask, Qw,Qw_mask,Q_mask,ex2Q,CQ_mask,Q_ids,Q_names,triples,cans,ids=ex
         dw, f, dw_mask, qw, qw_mask, Qw,Qw_mask = to_vars_torch(ex[:7],self.use_cuda,evaluate=True)
         [Q_mask,CQ_mask]=to_vars([Q_mask,CQ_mask], use_cuda=self.use_cuda,evaluate=True)   
         inputs=[dw, f, dw_mask, qw, qw_mask, Qw,Qw_mask,Q_mask,ex2Q]
         # return score_s, score_e, (after log softmax), Q_pure ( B,max_Q, before Q_mask,)     
         score_s, score_e ,pure_Q = self.network(*inputs)  # B,T_d,               test, just use to predict span
         # Decode predictions
         score_s = score_s.data.cpu()
         score_e = score_e.data.cpu()  
         max_len=15
         args = (score_s, score_e, cans, top_n, max_len)
         # cans: ex's all can's all token spans 
         if pool:
             handle=pool.apply_async(self.decode_candidates, args)
             ans_in_can,scores=handle.get()
         else:
             ans_in_can,scores=self.decode_candidates(*args)              
         # P(c|Q), consider mention
         pure_Q=pure_Q.clone()
         pure_Q[Q_mask.data==0]=-float('inf')
         #pure_Q.data.masked_fill_(Q_mask.data==0,-float('inf'))    
         
         B_max_Q=pure_Q.unsqueeze(1).expand_as(CQ_mask).clone()   # B,max_Q  --> B,max_c,max_Q  (give all Q' original score to c)
         B_max_Q[CQ_mask.data==0]=-float('inf')
         if self.normalize_q:            
             B_max_Q=F.softmax(B_max_Q.view(-1,B_max_Q.size(2))).view(B_max_Q.size(0),-1,B_max_Q.size(2))  # B,max_c,max_Q , get each c's P(Q|c)  some max_c 's, have no Q
         else:
             B_max_Q=torch.exp(B_max_Q)  
         # B_max_Q.data.masked_fill_(isnan(B_max_Q.data.cpu()).cuda(),0)  # some lines are invalid (lines >real can)
         B_max_Q=B_max_Q.clone()
         B_max_Q[isnan(B_max_Q.data.cpu()).cuda()]=0
         
         # P(Q)
         ans_in_can=to_var(np.array(ans_in_can,dtype='int64'),use_cuda=self.args.cuda)           # B,
         ans_index=ans_in_can.unsqueeze(1).expand(B_max_Q.size(0),B_max_Q.size(2)).unsqueeze(1)  # B,1,max_Q
         final_Q=B_max_Q.gather(1,ans_index).squeeze(1)  
         assert torch.sum(isnan(final_Q.data.cpu()))==0 
         #final_score,final_index=torch.max(final_Q,-1)                          # B,1
         final_score,final_index=torch.sort(final_Q,-1,descending=True)
         # return Q_mask,B_max_Q,final_Q,final_score,final_index
         return final_score,final_index,Q_mask,ids # B,1, each ex's predict Q's index and corresponding score
         # test: number, index : torch.max(final_Q,-1)  B,1     s: exp, (give candidate),find all can's score:B,max_c    still normalize  / B_max_Q  : exp
         #       scores, indexs:torch.sort(final_Q,-1)  B,max_Q           
         
     if train_mode=='contain' or train_mode=='NER':
         # C_pos,C_doc_mask,C_mask,Q_mask,CQ_mask:np   ex2Q,ans_in_Q ,Q_ids(each ex's all Q), list 
         dw, f, dw_mask, qw, qw_mask,Qw,Qw_mask,Q_mask,C_pos,C_doc_mask,C_mask,ex2Q,CQ_mask,Q_ids,Q_names,triples,ids=ex
         dw, f, dw_mask, qw, qw_mask,Qw,Qw_mask = to_vars_torch(ex[:7],self.use_cuda,evaluate=True)
         C_pos,C_doc_mask,C_mask,Q_mask,CQ_mask=to_vars([C_pos,C_doc_mask,C_mask,Q_mask,CQ_mask], use_cuda=self.use_cuda,evaluate=True)
         
         inputs=[dw, f, dw_mask, qw, qw_mask,Qw,Qw_mask,Q_mask,ex2Q]
         # return s (after doc mask + softmax ), Q_pure (B,max_Q, before Q_mask) 
         score,pure_Q=self.network(*inputs)
         # P(c)       
         s_masked=score*C_doc_mask.float()                                  # B,D    only keep candidate in s
         # keep watching
         if normalize_ss:
             s_masked+=0.00001
             s_normal=s_masked/torch.sum(s_masked,dim=1).expand_as(s_masked)# B,D    normalize s in candidate
         else:
             s_normal=s_masked
         # s_normal=s_masked/torch.sum(s_masked,dim=1).expand_as(s_masked)    # B,D    normalize s in candidate
         B_max_c=torch.bmm(s_normal.unsqueeze(1),C_pos.float()).squeeze(1)          # B,max_c   s: B,1,D *  B,D,max_c   sum c's pos in s  already c's prob,sum==1
         # B_max_c=B_max_c/torch.sum(B_max_c,dim=1).expand_as(B_max_c)        # just train
         # B_max_c.data.masked_fill_(C_mask.data==0,-float('inf'))            
         # B_max_c=F.softmax(B_max_c.data)                                    # B,max_c, after softmax   P(c)
         assert torch.sum(isnan(B_max_c.data.cpu()))==0  
         # P(c|Q), consider mention
         # pure_Q.data.masked_fill_(Q_mask.data==0,-float('inf'))
         pure_Q=pure_Q.clone()
         pure_Q[Q_mask.data==0]=-float('inf')
         B_max_Q=pure_Q.unsqueeze(1).expand_as(CQ_mask).clone()   # B,max_Q  --> B,max_c,max_Q  (give all Q' original score to c)
         B_max_Q[CQ_mask.data==0]=-float('inf')
         if self.normalize_q:            
             B_max_Q=F.softmax(B_max_Q.view(-1,B_max_Q.size(2))).view(B_max_Q.size(0),-1,B_max_Q.size(2))  # B,max_c,max_Q , get each c's P(Q|c)  some max_c 's, have no Q
         else:
             B_max_Q=torch.exp(B_max_Q)  
         B_max_Q=B_max_Q.clone()
         B_max_Q[isnan(B_max_Q.data.cpu()).cuda()]=0
         # B_max_Q.data.masked_fill_(isnan(B_max_Q.data.cpu()).cuda(),0)  # some lines are invalid (lines >real can)
         # P(Q)
         final_Q=torch.bmm(B_max_c.unsqueeze(1),B_max_Q).squeeze(1) # B,max_Q : bmm( B,1,max_c  P(c),  B,max_c,max_Q P(Q|c))
         final_Q=torch.exp(final_Q) if exp_final_Q else final_Q
         assert torch.sum(isnan(final_Q.data.cpu()))==0 
         
         # final_score,final_index=torch.max(final_Q,-1)            # B,1
         # return Q_mask,B_max_c,B_max_Q,final_Q,final_score,final_index
         final_score,final_index=torch.sort(final_Q,-1,descending=True)
         return final_score,final_index,Q_mask,ids # B,1, each ex's predict Q's index and corresponding score
         # test: number, index : torch.max(final_Q,-1)  B,1     s: exp, (give candidate),find all can's score:B,max_c    still normalize  / B_max_Q  : exp
         #       scores, indexs:torch.sort(final_Q,-1)  B,max_Q
         
     if train_mode=='span':
         # start_indexs,end_indexs,span_mask,span2c,C_mask, Q_mask, CQ_mask: np                  ex2Q,ans_in_Q: list
         dw, f, dw_mask, qw, qw_mask,Qw,Qw_mask,Q_mask,start_indexs,end_indexs,span_mask,span2c,C_mask,ex2Q,CQ_mask,Q_ids,Q_names,triples,ids=ex
         dw, f, dw_mask, qw, qw_mask,Qw,Qw_mask = to_vars_torch(ex[:7],self.use_cuda,evaluate=True)
         start_indexs,end_indexs,span_mask,span2c,C_mask, Q_mask, CQ_mask=to_vars\
         ([start_indexs,end_indexs,span_mask,span2c,C_mask, Q_mask, CQ_mask], use_cuda=self.use_cuda,evaluate=True)
         inputs=[dw, f, dw_mask, qw, qw_mask,Qw,Qw_mask,Q_mask,ex2Q]
         # return score_s, score_e, (after doc mask ,softmax), Q_pure (B,max_Q, before mask) 
         score_s, score_e,pure_Q=self.network(*inputs)
         
         # combine start_indexs, end_indexs, span_mask, span2c, C_mask to compute B,max_c;
         # combine Q_mask for pure Q and CQ_mask for real Q
         # P(c)
         span_start=score_s.gather(dim=1,index=start_indexs) #  B,D--> B,max_span * span_mask (softmax), softmax_over_span.each span's score 
         span_end=score_e.gather(dim=1,index=end_indexs)     #  B,D--> B,max_span
         span_s=span_start*span_end*span_mask.float()
         if normalize_ss:
             span_s+=0.00001
             span_normal=span_s/torch.sum(span_s,dim=1).expand_as(span_s)           # normalize  B,max_span, each span's score, after mask
         else:
             span_normal=span_s
         #span_normal=span_s/torch.sum(span_s,dim=1).expand_as(span_s)
         B_max_c=torch.bmm(span_normal.unsqueeze(1),span2c.float()).squeeze(1)  # B,1,max_span  B,max_span,max_can     bmm-->     B,max_num_c
         # B_max_c.data.masked_fill_(C_mask.data==0,-float('inf'))            
         # B_max_c=F.softmax(B_max_c.data)                                   # B,max_c, after softmax   P(c)
         assert torch.sum(isnan(B_max_c.data.cpu()))==0  
                   
         # P(c|Q), consider mention
         # pure_Q.data.masked_fill_(Q_mask.data==0,-float('inf'))
         pure_Q=pure_Q.clone()
         pure_Q[Q_mask.data==0]=-float('inf')
         B_max_Q=pure_Q.unsqueeze(1).expand_as(CQ_mask).clone()   # B,max_Q  --> B,max_c,max_Q  (give all Q' original score to c)
         B_max_Q[CQ_mask.data==0]=-float('inf')
         if self.normalize_q:            
             B_max_Q=F.softmax(B_max_Q.view(-1,B_max_Q.size(2))).view(B_max_Q.size(0),-1,B_max_Q.size(2))  # B,max_c,max_Q , get each c's P(Q|c)  some max_c 's, have no Q
         else:
             B_max_Q=torch.exp(B_max_Q)  # B,max_c,max_Q , get each c's P(Q|c)  some max_c 's, have no Q
         # have max_can line, no nan/  other line, all nan
         B_max_Q=B_max_Q.clone()
         B_max_Q[isnan(B_max_Q.data.cpu()).cuda()]=0
         # B_max_Q.data.masked_fill_(isnan(B_max_Q.data.cpu()).cuda(),0)  # some lines are invalid (lines >real can)
         
         # P(Q)
         final_Q=torch.bmm(B_max_c.unsqueeze(1),B_max_Q).squeeze(1) # B,max_Q : bmm( B,1,max_c  P(c),  B,max_c,max_Q P(Q|c))
         final_Q=torch.exp(final_Q) if exp_final_Q else final_Q
         assert torch.sum(isnan(final_Q.data.cpu()))==0 
         
         final_score,final_index=torch.sort(final_Q,-1,descending=True)
         # final_score1,final_index1=final_score1[:,0],final_index1[:,0]
         
         # final_score2,final_index2=torch.max(final_Q,-1)                          # B,1
         # final_score2,final_index2=final_score2.squeeze(1),final_index2.squeeze(1)
         # print(final_Q)
         #print(final_index)
         #print(Q_mask)
         # return Q_mask,B_max_c,B_max_Q,final_Q,final_score,final_index
        return final_score, final_index, Q_mask, ids
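Every snippet in this listing leans on an `isnan` helper that must accept scalars, strings, NumPy arrays, and (pre-`torch.isnan`) torch tensors; the actual `utils.isnan` is not shown. A minimal sketch under those assumptions, using the fact that NaN is the only value that compares unequal to itself:

import numpy as np

def isnan(x):
    """Element-wise NaN test for arrays/tensors, scalar test otherwise."""
    if isinstance(x, np.ndarray) or hasattr(x, 'ne'):  # ndarray, or duck-typed torch tensor / pandas Series
        return x != x  # element-wise: only NaN is unequal to itself
    try:
        return np.isnan(float(x))
    except (TypeError, ValueError):
        return False  # strings and other non-numerics are never NaN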
Example #19
0
def plot_feature_contributions(X,
                               feature_index,
                               fcs,
                               attributes,
                               class_of_interest,
                               title=None):

    if (not utils.isint(X[utils.firstNotNan(
            X[:, feature_index])][feature_index]) and not utils.isfloat(
                X[utils.firstNotNan(X[:, feature_index])][feature_index])):
        values = [i for i in set(X[:, feature_index]) if not utils.isnan(i)
                  ] + [np.nan]

        pos_fcs = []
        neg_fcs = []
        pos_values = []
        neg_values = []
        zero_fcs = []
        zero_values = []
        contributions = {}

        for i in range(X.shape[0]):

            if (feature_index in fcs[i].keys()):
                if (fcs[i][feature_index][class_of_interest] > 0):
                    pos_fcs.append(fcs[i][feature_index][class_of_interest])
                    # handled separately because NaN != NaN, so a missing entry
                    # can never be matched against the collected values
                    if (utils.isnan(X[i][feature_index])):
                        pos_values.append(len(values) - 1)
                    else:
                        pos_values.append(values.index(X[i][feature_index]))
                elif (fcs[i][feature_index][class_of_interest] == 0):
                    zero_fcs.append(0)
                    if (utils.isnan(X[i][feature_index])):
                        zero_values.append(len(values) - 1)
                    else:
                        zero_values.append(values.index(X[i][feature_index]))
                else:
                    neg_fcs.append(fcs[i][feature_index][class_of_interest])
                    if (utils.isnan(X[i][feature_index])):
                        neg_values.append(len(values) - 1)
                    else:
                        neg_values.append(values.index(X[i][feature_index]))
                if (X[i][feature_index] not in contributions.keys()):
                    contributions[X[i][feature_index]] = [
                        fcs[i][feature_index][class_of_interest]
                    ]
                else:
                    contributions[X[i][feature_index]].append(
                        fcs[i][feature_index][class_of_interest])

        print('Contributions:')
        for value in contributions.keys():
            print('Value %r' % value)
            print(
                '\nMean: %r Variance: %r' %
                (np.mean(contributions[value]), np.var(contributions[value])))

        c = (contributions.items())
        boxplot([a[1] for a in c], [a[0] for a in c], title=None)

        ax = plt.subplot(111)
        plt.plot(pos_fcs, pos_values, 'x', color='blue')
        plt.plot(neg_fcs, neg_values, 'x', color='red')
        plt.plot(zero_fcs, zero_values, 'x', color='black')
        plt.xlabel('feature contribution')
        plt.ylabel('values of feature %r' % attributes[feature_index])
        ax.set_yticks(np.array(range(len(values) + 2)) - 1)
        ax.set_yticklabels([str('')] + values + [str('')])
        plt.show()

    else:

        values = sorted([
            round(i, 4) for i in (set(X[:, feature_index]))
            if not utils.isnan(i)
        ])  # + [np.nan]

        nan_index = values[-1] - values[-2]  # step size used to place NaN one slot above the largest real value
        pos_fcs = []
        neg_fcs = []
        pos_values = []
        neg_values = []
        zero_fcs = []
        zero_values = []
        contributions = {}

        for i in range(X.shape[0]):
            if (feature_index in fcs[i].keys()):

                if (fcs[i][feature_index][class_of_interest] > 0):
                    pos_fcs.append(fcs[i][feature_index][class_of_interest])
                    # handled separately because NaN != NaN, so a missing entry
                    # can never be matched against the collected values
                    if (utils.isnan(X[i][feature_index])):
                        pos_values.append(values[-1] + nan_index)
                    else:
                        pos_values.append(X[i][feature_index])
                elif (fcs[i][feature_index][class_of_interest] == 0):
                    zero_fcs.append(0)
                    if (utils.isnan(X[i][feature_index])):
                        zero_values.append(values[-1] + nan_index)
                    else:
                        zero_values.append(X[i][feature_index])
                else:
                    neg_fcs.append(fcs[i][feature_index][class_of_interest])
                    if (utils.isnan(X[i][feature_index])):
                        neg_values.append(values[-1] + nan_index)
                    else:
                        neg_values.append((X[i][feature_index]))
                if (utils.isnan(X[i][feature_index])):
                    if ('nan' in contributions.keys()):
                        contributions['nan'].append(
                            fcs[i][feature_index][class_of_interest])
                    else:
                        contributions['nan'] = [
                            fcs[i][feature_index][class_of_interest]
                        ]
                elif (X[i][feature_index] in contributions.keys()):
                    contributions[(X[i][feature_index])].append(
                        fcs[i][feature_index][class_of_interest])
                else:
                    contributions[(X[i][feature_index])] = [
                        fcs[i][feature_index][class_of_interest]
                    ]

        print('Contributions:')
        for value in contributions.keys():
            print('Value %r' % value)
            print(
                'Mean: %r Variance: %r' %
                (np.mean(contributions[value]), np.var(contributions[value])))
        c = (contributions.items())
        boxplot([a[1] for a in c], [a[0] for a in c], title=None)
        fig, ax = plt.subplots()
        plt.plot(pos_fcs, pos_values, 'x', color='blue')
        plt.plot(neg_fcs, neg_values, 'x', color='red')
        plt.plot(zero_fcs, zero_values, 'x', color='black')
        fig.canvas.draw()
        labels = [''] + [item.get_text()
                         for item in ax.get_yticklabels()] + ['']
        if (values[-1] + nan_index < ax.get_yticks()[-1]):
            plt.yticks(
                [values[0] - nan_index] +
                sorted(list(ax.get_yticks()) + [values[-1] + nan_index]))
        else:
            plt.yticks([values[0] - nan_index] + sorted(
                list(ax.get_yticks()) +
                [values[-1] + nan_index, values[-1] + 2 * nan_index]))
        labels[-2] = 'nan'

        plt.xlabel('feature contribution')
        plt.ylabel('values of feature %r' % attributes[feature_index])
        ax.set_yticklabels(labels)

        plt.show()

    if (title is not None):
        # note: with interactive backends plt.show() can clear the figure, so the
        # saved file may be blank; saving before show() avoids that
        plt.savefig(title)
        plt.close()
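The example above (like #20 below) plots missing entries one "step" past the largest real value so NaNs remain visible on the y-axis. The same trick in isolation, on made-up data:

import numpy as np
import matplotlib.pyplot as plt

vals = [1.0, 2.5, np.nan, 4.0]
ticks = sorted(v for v in set(vals) if not np.isnan(v))
nan_slot = ticks[-1] + (ticks[-1] - ticks[-2])  # one step above the max
y = [nan_slot if np.isnan(v) else v for v in vals]
plt.plot(range(len(y)), y, 'x')
plt.yticks(ticks + [nan_slot], ['%g' % t for t in ticks] + ['nan'])
plt.show()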
Example #20
0
def plot_feature_contributions_surgery_class(X,
                                             y,
                                             feature_index,
                                             fcs,
                                             attributes,
                                             class_of_interest,
                                             title=None):
    surgery_index = np.where(attributes == 'Q44071_snCplexoAt')[0][0]

    if (not utils.isint(X[utils.firstNotNan(
            X[:, feature_index])][feature_index]) and not utils.isfloat(
                X[utils.firstNotNan(X[:, feature_index])][feature_index])):
        values = [i for i in set(X[:, feature_index]) if not utils.isnan(i)
                  ] + [np.nan]

        x_surgery = []
        surgery_colors = []
        x_no_surgery = []
        no_surgery_colors = []
        x_nan = []
        nan_colors = []
        y_surgery = []
        y_no_surgery = []
        y_nan = []

        contributions = {}

        for i in range(X.shape[0]):

            if (feature_index in fcs[i].keys()):

                if (X[i][surgery_index] == 'S' or X[i][surgery_index] == 'Y'):
                    x_surgery.append(fcs[i][feature_index][class_of_interest])
                    y_surgery.append(values.index(X[i][feature_index]))
                    if (y[i] == class_of_interest):
                        surgery_colors.append('blue')
                    else:
                        surgery_colors.append('red')

                elif (utils.isnan(X[i][surgery_index])):
                    x_nan.append(fcs[i][feature_index][class_of_interest])
                    # handled separately because NaN != NaN, so a missing entry
                    # can never be matched against the collected values
                    y_nan.append(len(values) - 1)
                    if (y[i] == class_of_interest):
                        nan_colors.append('blue')
                    else:
                        nan_colors.append('red')
                else:
                    x_no_surgery.append(
                        fcs[i][feature_index][class_of_interest])
                    y_no_surgery.append(values.index(X[i][feature_index]))
                    if (y[i] == class_of_interest):
                        no_surgery_colors.append('blue')
                    else:
                        no_surgery_colors.append('red')

                # if(X[i][feature_index] not in contributions.keys()):
                #     contributions[X[i][feature_index]] = [fcs[i][feature_index][class_of_interest]]
                # else:
                #     contributions[X[i][feature_index]].append(fcs[i][feature_index][class_of_interest])
        coi = str(class_of_interest)
        ax = plt.subplot(111)
        ax.scatter(x_surgery,
                   y_surgery,
                   marker='o',
                   s=60,
                   edgecolors=surgery_colors,
                   facecolors='none')
        ax.scatter(x_no_surgery,
                   y_no_surgery,
                   marker='x',
                   s=60,
                   edgecolors=no_surgery_colors,
                   facecolors='none')
        ax.scatter(x_nan,
                   y_nan,
                   marker='d',
                   s=60,
                   edgecolors=nan_colors,
                   facecolors='none')
        plt.xlabel('feature contribution')
        plt.ylabel('values of feature %r' % attributes[feature_index])
        ax.set_yticks(np.array(range(len(values) + 2)) - 1)
        ax.set_yticklabels([str('')] + values + [str('')])
        red_patch = mpatches.Patch(color='red')
        blue_patch = mpatches.Patch(color='blue')
        xmarker = mlines.Line2D([], [],
                                color='black',
                                marker='x',
                                markersize=10,
                                linestyle='None')
        omarker = mlines.Line2D([], [],
                                color='black',
                                marker='o',
                                markersize=10,
                                linestyle='None',
                                markerfacecolor='None',
                                markeredgecolor='black')
        #plt.legend(handles=[red_patch,blue_patch])

        plt.legend([red_patch, blue_patch, xmarker, omarker], [
            'Classe da instância ≠ ' + coi, 'Classe da instância = ' + coi,
            'Não passou por cirurgia', 'Passou por cirurgia'
        ],  # labels: instance class != / == class of interest; did not / did undergo surgery
                   numpoints=1,
                   fontsize='small')
        plt.show()

    else:

        values = sorted([
            round(i, 4) for i in (set(X[:, feature_index]))
            if not utils.isnan(i)
        ])  # + [np.nan]
        print(values)
        nan_index = values[-1] - values[-2]  # step size used to place NaN one slot above the largest real value
        x_surgery = []
        surgery_colors = []
        x_no_surgery = []
        no_surgery_colors = []
        x_nan = []
        nan_colors = []
        y_surgery = []
        y_no_surgery = []
        y_nan = []

        for i in range(X.shape[0]):
            if (feature_index in fcs[i].keys()):
                if (X[i][surgery_index] == 'S' or X[i][surgery_index] == 'Y'):
                    x_surgery.append(fcs[i][feature_index][class_of_interest])
                    y_surgery.append((X[i][feature_index]))
                    if (y[i] == class_of_interest):
                        surgery_colors.append('blue')
                    else:
                        surgery_colors.append('red')
                elif (utils.isnan(X[i][surgery_index])):
                    x_nan.append(fcs[i][feature_index][class_of_interest])
                    # handled separately because NaN != NaN, so a missing entry
                    # can never be matched against the collected values
                    y_nan.append(values[-1] + nan_index)
                    if (y[i] == class_of_interest):
                        nan_colors.append('blue')
                    else:
                        nan_colors.append('red')
                else:
                    x_no_surgery.append(
                        fcs[i][feature_index][class_of_interest])
                    y_no_surgery.append((X[i][feature_index]))
                    if (y[i] == class_of_interest):
                        no_surgery_colors.append('blue')
                    else:
                        no_surgery_colors.append('red')
        coi = str(class_of_interest)
        fig, ax = plt.subplots()
        ax.scatter(x_surgery,
                   y_surgery,
                   marker='o',
                   s=60,
                   facecolors='none',
                   edgecolors=surgery_colors)
        ax.scatter(x_no_surgery,
                   y_no_surgery,
                   marker='x',
                   s=60,
                   edgecolors=no_surgery_colors)
        ax.scatter(x_nan,
                   y_nan,
                   marker='d',
                   s=60,
                   facecolors='none',
                   edgecolors=nan_colors)
        fig.canvas.draw()
        labels = [''] + [item.get_text()
                         for item in ax.get_yticklabels()] + ['']
        if (values[-1] + nan_index < ax.get_yticks()[-1]):
            plt.yticks(
                [values[0] - nan_index] +
                sorted(list(ax.get_yticks()) + [values[-1] + nan_index]))
        else:
            plt.yticks([values[0] - nan_index] + sorted(
                list(ax.get_yticks()) +
                [values[-1] + nan_index, values[-1] + 2 * nan_index]))
        labels[-2] = 'nan'

        plt.xlabel('feature contribution')
        plt.ylabel('values of feature %r' % attributes[feature_index])
        ax.set_yticklabels(labels)
        red_patch = mpatches.Patch(color='red')
        blue_patch = mpatches.Patch(color='blue')
        xmarker = mlines.Line2D([], [],
                                color='black',
                                marker='x',
                                markersize=10,
                                label='Bla',  # placeholder; overridden by the labels passed to plt.legend below
                                linestyle='None')
        omarker = mlines.Line2D([], [],
                                color='black',
                                marker='o',
                                markersize=10,
                                label='Bla',  # placeholder; overridden by the labels passed to plt.legend below
                                linestyle='None',
                                markerfacecolor='None',
                                markeredgecolor='black')
        #plt.legend(handles=[red_patch,blue_patch])
        plt.legend([red_patch, blue_patch, xmarker, omarker], [
            'Classe da instância ≠ ' + coi, 'Classe da instância = ' + coi,
            'Não passou por cirurgia', 'Passou por cirurgia'
        ],  # labels: instance class != / == class of interest; did not / did undergo surgery
                   numpoints=1,
                   fontsize='small')
        plt.show()

    if (title is not None):
        plt.savefig(title)
        plt.close()
        # write the debug dump to a separate file so it does not overwrite the
        # image that was just saved under the same name
        f = open(title + '.txt', 'w')
        f.write('X=' + str(X))
        f.write('\ny=' + str(y))
        f.write('\nfcs=' + str(fcs))
        f.write('\nfeatures=' + str(attributes))
        f.write('\nfeature_index=' + str(feature_index))
        f.write('\nvalues=' + str(values))
        f.write('\nx_surgery=' + str(x_surgery))
        f.write('\ny_surgery=' + str(y_surgery))
        f.write('\nsurgery_colors=' + str(surgery_colors))
        f.write('\nx_no_surgery=' + str(x_no_surgery))
        f.write('\ny_no_surgery=' + str(y_no_surgery))
        f.write('\nno_surgery_colors=' + str(no_surgery_colors))
        f.write('\nx_nan=' + str(x_nan))
        f.write('\ny_nan=' + str(y_nan))
        f.write('\nnan_colors=' + str(nan_colors))
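The legends above are assembled from proxy artists, since scatter calls with per-point colors do not yield usable legend handles. The proxy pattern in isolation, with hypothetical labels:

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.lines as mlines

red_patch = mpatches.Patch(color='red')  # stands in for the red-edged points
xmarker = mlines.Line2D([], [], color='black', marker='x', linestyle='None')
plt.legend([red_patch, xmarker], ['instance class != target', 'no surgery'],
           numpoints=1, fontsize='small')
plt.show()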
Example #21
0
    def refineDataFrame(self):
        """ Refines self.df to set the default download mode as 'audio'
        and creates new entries for default_title, default_artist, and
        default_album to check if fresh download is required.
        
        Usage:
        -----
        self.refineDataFrame()
        
        Returns:
        -------
        NULL
        
        Creates:
        -------
        self.df : pandas dataframe
            Creates 3 new columns default_title, default_artist, and
            default_album from existing download. If no file exists it
            is left blank.
        """

        self.df['default_title'] = ''
        self.df['default_artist'] = ''
        self.df['default_album'] = ''
        [row, col] = self.df.shape
        for r in range(row):
            title = artist = album = ''  # defaults in case no branch below assigns them
            if (utils.isnan(self.df['mode'][r])):
                self.df.loc[r, 'mode'] = 'audio'  # .loc avoids pandas chained-assignment pitfalls
            if (isinstance(self.df['title'][r], str)):
                if (self.df['mode'][r] == 'audio'):
                    if (os.path.exists('audio/' + self.df['title'][r] +
                                       '.mp3')):
                        title, artist, album = utils.get_metadata_file(
                            'audio/' + self.df['title'][r] + '.mp3')
                    else:
                        title = self.df['title'][r]
                        if not (utils.isnan(self.df['artist'][r])):
                            artist = self.df['artist'][r]
                        if not (utils.isnan(self.df['album'][r])):
                            album = self.df['album'][r]
                elif (self.df['mode'][r] == 'video'):
                    if (os.path.exists('video/' + self.df['title'][r] +
                                       '.mp4')):
                        title, artist, album = utils.get_metadata_file(
                            'video/' + self.df['title'][r] + '.mp4')
                    else:
                        title = self.df['title'][r]
                        if not (utils.isnan(self.df['artist'][r])):
                            artist = self.df['artist'][r]
                        if not (utils.isnan(self.df['album'][r])):
                            album = self.df['album'][r]
                else:
                    sys.exit('Not a valid mode. Quitting program.')
            else:
                title, artist, album = utils.get_metadata_link(
                    self.df['link'][r])
            if (utils.isnan(self.df['title'][r])):
                self.df.loc[r, 'title'] = title
            if (utils.isnan(self.df['artist'][r])):
                self.df.loc[r, 'artist'] = ''
            if (utils.isnan(self.df['album'][r])):
                self.df.loc[r, 'album'] = ''
            self.df.loc[r, 'default_title'] = title
            self.df.loc[r, 'default_artist'] = artist
            self.df.loc[r, 'default_album'] = album
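The row-by-row NaN defaulting above can also be done in one vectorized pass; a sketch on a toy frame, not the class's real data:

import numpy as np
import pandas as pd

df = pd.DataFrame({'mode': ['video', np.nan], 'album': [np.nan, 'x']})
df['mode'] = df['mode'].fillna('audio')  # default download mode
df['album'] = df['album'].fillna('')     # blank out missing tags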
Example #22
0
    def forward(self, dw, f, dw_mask, qw, qw_mask, Qw, Qw_mask, Q_mask, ex2Q):
        # embeddings
        dw_emb = self.embedding(dw)  # B,|D|,h
        qw_emb = self.embedding(qw)  # B,|Q|,h
        Qw_emb = self.embedding(Qw)  # Q_max,|Q_tokens|,h
        B = len(dw_emb)
        # Q=len(Qw_emb)
        # dropout on embeddings
        if self.args.dropout_emb > 0:
            dw_emb = F.dropout(dw_emb,
                               p=self.args.dropout_emb,
                               training=self.training)
            qw_emb = F.dropout(qw_emb,
                               p=self.args.dropout_emb,
                               training=self.training)
            Qw_emb = F.dropout(Qw_emb,
                               p=self.args.dropout_emb,
                               training=self.training)
        # each doc token's attention-weighted sum over the query, used as that token's soft-match feature vector (cf. q_in_token)
        doc_input = [dw_emb]
        if self.args.doc_use_qemb:
            if self.self_linear:
                dw_project = self.Linear_self(dw_emb.view(
                    -1, self.embed_size)).view(B, -1,
                                               self.embed_size)  # B,|D|,h
                dw_project = F.relu(dw_project)
                qw_project = self.Linear_self(qw_emb.view(
                    -1, self.embed_size)).view(B, -1,
                                               self.embed_size)  # B,|Q|,h
                qw_project = F.relu(qw_project)
            else:
                dw_project = dw_emb
                qw_project = qw_emb
            b2q_att = torch.bmm(dw_project, qw_project.transpose(
                2, 1))  # B,|D|,|Q|, each d to all q's attention score
            b2q_att = b2q_att.clone()
            b2q_att[qw_mask.unsqueeze(1).expand_as(
                b2q_att).data] = -float('inf')
            # b2q_att.data.masked_fill_(qw_mask.unsqueeze(1).expand_as(b2q_att).data,-float('inf'))  # masked with q's real len
            b2q_att = F.softmax(b2q_att.view(-1, qw_emb.size(1))).view(
                B, dw_project.size(1), qw_project.size(
                    1))  # and softmax  B,|D|,|Q|      0.1,0.3,0.6,0
            b2q_each_vec = torch.bmm(
                b2q_att,
                qw_project)  # B,|D|,h_q   each d's summed attention to Q: 1,h
            doc_input.append(b2q_each_vec)
        if self.args.num_features > 0:
            doc_input.append(f)

        # doc encoder
        # B,|D|,h, (B,|D|,h), B,|D|,n_f
        doc_input = torch.cat(doc_input, 2)
        # no padding
        if (self.training
                and not self.args.rnn_padding) or dw_mask.data.sum() == 0:
            outputs = [doc_input]
            hns = []
            for i in range(len(self.doc_encoder)):
                inputs = outputs[-1]
                # dropout on this layer
                inputs = F.dropout(inputs,
                                   training=self.training,
                                   p=self.dropout_rnn)
                output, h_n = self.doc_encoder[i](
                    inputs
                )  # output: B,T,n_direction*n_hidden  # h_n: n_direction,B,n_hidden    # lstm hn: (h_n, c_n)
                outputs.append(output)
                h_n = torch.cat(h_n,
                                -1) if self.rnn_type != 'lstm' else torch.cat(
                                    h_n[0], -1)
                hns.append(h_n)
            if self.concat:
                doc_output = torch.cat(
                    outputs[1:], -1
                )  # B,D, n_lay*n_direct*h   each token t :  h1,t->,h1,t <-, h2,t->,h2,t <-,  h3,t->,h3,t <-,
            else:
                doc_output = outputs[
                    -1]  # B,D, n_direct*h         each token t:   h3,t->,h3,t <-,
        # padding
        elif self.args.rnn_padding or not self.training:
            l = torch.sum(dw_mask.eq(0).long(), 1).squeeze(-1)  # B, real len
            sort_len, sort_idx = torch.sort(l, dim=0, descending=True)  # B,
            _, resort = torch.sort(sort_idx,
                                   dim=0)  # resort B's ex to original
            outputs = [doc_input[sort_idx.data]]
            hns = []
            for i in range(len(self.doc_encoder)):
                inputs = outputs[-1]
                pack_inputs = torch.nn.utils.rnn.pack_padded_sequence(
                    inputs, sort_len.data.cpu().numpy(),
                    batch_first=True)  # pack input; lengths given as a numpy array or list
                inputs = F.dropout(pack_inputs.data,
                                   training=self.training,
                                   p=self.dropout_rnn)  # dropout
                inputs = torch.nn.utils.rnn.PackedSequence(
                    inputs, pack_inputs.batch_sizes)  # repack
                output, h_n = self.doc_encoder[i](
                    inputs
                )  # output: B,T,n_direction*n_hidden   # h_n: n_direction,B,n_hidden
                output, _ = torch.nn.utils.rnn.pad_packed_sequence(
                    output, batch_first=True)  # real_output, output_len
                outputs.append(output)
                h_n = torch.cat(h_n,
                                -1) if self.rnn_type != 'lstm' else torch.cat(
                                    h_n[0], -1)
                hns.append(h_n)
            if self.concat:
                doc_output = torch.cat(
                    outputs[1:], -1
                )  # B,D, n_lay*n_direct*h   each token t :  h1,t->,h1,t <-, h2,t->,h2,t <-,  h3,t->,h3,t <-,
            else:
                doc_output = outputs[
                    -1]  # B,D, n_direct*h         each token t:   h3,t->,h3,t <-,
            doc_output = doc_output[resort.data]
            # after unpacking, the output's time dimension may be shorter than the mask; pad it back to full length
            if doc_output.size(1) != dw_mask.size(1):
                padding = torch.zeros(doc_output.size(0),
                                      dw_mask.size(1) - doc_output.size(1),
                                      doc_output.size(2)).type(
                                          doc_output.data.type())
                doc_output = torch.cat([doc_output, Variable(padding)], 1)
        if self.concat:
            doc_h = torch.cat(hns, -1)  # B,n_direc*n_layer*h
        else:
            doc_h = hns[-1]  # B,n_direc*h

        doc_output = F.dropout(doc_output,
                               training=self.training,
                               p=self.final_dropout
                               )  # B,|D|,n_direc*n_layer*h / B,|D|,n_direc*h
        doc_h = F.dropout(
            doc_h, training=self.training,
            p=self.h_output)  # B,n_direc*n_layer*h / B,n_direc*h

        # question encoder
        # no padding
        if (self.training
                and not self.args.rnn_padding) or qw_mask.data.sum() == 0:
            outputs = [qw_emb]  # B,|Q|,h
            hns = []
            for i in range(len(self.ques_encoder)):
                inputs = outputs[-1]
                # dropout on this layer
                inputs = F.dropout(inputs,
                                   training=self.training,
                                   p=self.dropout_rnn)
                output, h_n = self.ques_encoder[i](
                    inputs
                )  # output: B,T,n_direction*n_hidden  # h_n: n_direction,B,n_hidden    # lstm hn: (h_n, c_n)
                #print(output.size())
                outputs.append(output)
                h_n = torch.cat(h_n,
                                -1) if self.rnn_type != 'lstm' else torch.cat(
                                    h_n[0], -1)
                hns.append(h_n)
            if self.concat:
                ques_output = torch.cat(
                    outputs[1:], -1
                )  # B,Q, n_lay*n_direct*h   each token t :  h1,t->,h1,t <-, h2,t->,h2,t <-,  h3,t->,h3,t <-,
            else:
                ques_output = outputs[
                    -1]  # B,Q, n_direct*h         each token t:   h3,t->,h3,t <-,
        # padding
        elif self.args.rnn_padding or not self.training:
            l = torch.sum(qw_mask.eq(0).long(), 1).squeeze(-1)  # B, real len
            sort_len, sort_idx = torch.sort(l, dim=0, descending=True)  # B,
            _, resort = torch.sort(sort_idx,
                                   dim=0)  # resort B's ex to original
            outputs = [qw_emb[sort_idx.data]]
            hns = []
            for i in range(len(self.ques_encoder)):
                inputs = outputs[-1]
                pack_inputs = torch.nn.utils.rnn.pack_padded_sequence(
                    inputs, sort_len.data.cpu().numpy(),
                    batch_first=True)  # pack input; lengths given as a numpy array or list
                inputs = F.dropout(pack_inputs.data,
                                   training=self.training,
                                   p=self.dropout_rnn)  # dropout
                inputs = torch.nn.utils.rnn.PackedSequence(
                    inputs, pack_inputs.batch_sizes)  # repack
                output, h_n = self.ques_encoder[i](
                    inputs
                )  # output: B,T,n_direction*n_hidden   # h_n: n_direction,B,n_hidden
                output, _ = torch.nn.utils.rnn.pad_packed_sequence(
                    output, batch_first=True)  # real_output, output_len
                #print(output.size())
                outputs.append(output)
                h_n = torch.cat(h_n,
                                -1) if self.rnn_type != 'lstm' else torch.cat(
                                    h_n[0], -1)
                hns.append(h_n)
            if self.concat:
                ques_output = torch.cat(
                    outputs[1:], -1
                )  # B,T, n_lay*n_direct*h   each token t :  h1,t->,h1,t <-, h2,t->,h2,t <-,  h3,t->,h3,t <-,
            else:
                ques_output = outputs[
                    -1]  # B,T, n_direct*h         each token t:   h3,t->,h3,t <-,
            ques_output = ques_output[resort.data]
            # after unpacking, the output's time dimension may be shorter than the mask; pad it back to full length
            if ques_output.size(1) != qw_mask.size(1):
                padding = torch.zeros(ques_output.size(0),
                                      qw_mask.size(1) - ques_output.size(1),
                                      ques_output.size(2)).type(
                                          ques_output.data.type())
                ques_output = torch.cat([ques_output, Variable(padding)], 1)
        if self.concat:
            ques_h = torch.cat(hns, -1)  # B,n_direc*n_layer*h
        else:
            ques_h = hns[-1]  # B,n_direc*h

        ques_output = F.dropout(
            ques_output, training=self.training, p=self.final_dropout
        )  # B,|Q|,n_direc*n_layer*h / B,|Q|,n_direc*h
        ques_h = F.dropout(
            ques_h, training=self.training,
            p=self.h_output)  # B,q_h: B,n_direc*n_layer*h / B,n_direc*h
        # give different q_token different weight
        if self.args.q_self_weight:
            #print(self.h)
            #print(self.ques_output_size)
            #print(ques_output.size())  # B,T, n_lay*n_direct*h
            self_score = self.q_self_Linear(
                ques_output.view(-1, self.ques_output_size)).squeeze(-1).view(
                    B,
                    -1)  # B*|Q|,h    h,1 -->  B,Q   each q token's self score
            self_score = self_score.clone()
            self_score[qw_mask.data] = -float('inf')
            #self_score.data.masked_fill_(qw_mask.data,-float('inf'))
            self_score = F.softmax(self_score)  # B,|Q|
            ques_final = torch.bmm(
                self_score.unsqueeze(1), ques_output
            ).squeeze(
                1
            )  # B,1,|Q|  * B,|Q|,q_h  --> B,q_h  can use ques_final/ ques_h

        # Q encoder
        # no padding
        if (self.training
                and not self.args.rnn_padding) or Qw_mask.data.sum() == 0:
            outputs = [Qw_emb]  # n_Q,|Q_tokens|,h
            hns = []
            for i in range(len(self.Q_encoder)):
                inputs = outputs[-1]
                # dropout on this layer
                inputs = F.dropout(inputs,
                                   training=self.training,
                                   p=self.dropout_rnn)
                output, h_n = self.Q_encoder[i](
                    inputs
                )  # output: B,T,n_direction*n_hidden  # h_n: n_direction,B,n_hidden    # lstm hn: (h_n, c_n)
                outputs.append(output)
                h_n = torch.cat(h_n,
                                -1) if self.rnn_type != 'lstm' else torch.cat(
                                    h_n[0], -1)
                hns.append(h_n)
            if self.concat:
                Q_output = torch.cat(
                    outputs[1:], -1
                )  # B,Q, n_lay*n_direct*h   each token t :  h1,t->,h1,t <-, h2,t->,h2,t <-,  h3,t->,h3,t <-,
            else:
                Q_output = outputs[
                    -1]  # B,Q, n_direct*h         each token t:   h3,t->,h3,t <-,

        # padding
        elif self.args.rnn_padding or not self.training:
            l = torch.sum(Qw_mask.eq(0).long(), 1).squeeze(-1)  # B, real len
            sort_len, sort_idx = torch.sort(l, dim=0, descending=True)  # B,
            _, resort = torch.sort(sort_idx,
                                   dim=0)  # resort B's ex to original
            outputs = [Qw_emb[sort_idx.data]]
            hns = []
            for i in range(len(self.Q_encoder)):
                inputs = outputs[-1]
                pack_inputs = torch.nn.utils.rnn.pack_padded_sequence(
                    inputs, sort_len.data.cpu().numpy(),
                    batch_first=True)  # pack input; lengths given as a numpy array or list
                inputs = F.dropout(pack_inputs.data,
                                   training=self.training,
                                   p=self.dropout_rnn)  # dropout
                inputs = torch.nn.utils.rnn.PackedSequence(
                    inputs, pack_inputs.batch_sizes)  # repack
                output, h_n = self.Q_encoder[i](
                    inputs
                )  # output: B,T,n_direction*n_hidden   # h_n: n_direction,B,n_hidden
                output, _ = torch.nn.utils.rnn.pad_packed_sequence(
                    output, batch_first=True)  # real_output, output_len
                outputs.append(output)
                h_n = torch.cat(h_n,
                                -1) if self.rnn_type != 'lstm' else torch.cat(
                                    h_n[0], -1)
                hns.append(h_n)
            if self.concat:
                Q_output = torch.cat(
                    outputs[1:], -1
                )  # B,T, n_lay*n_direct*h   each token t :  h1,t->,h1,t <-, h2,t->,h2,t <-,  h3,t->,h3,t <-,
            else:
                Q_output = outputs[
                    -1]  # B,T, n_direct*h         each token t:   h3,t->,h3,t <-,
            Q_output = Q_output[resort.data]
            # after unpacking, the output's time dimension may be shorter than the mask; pad it back to full length
            if Q_output.size(1) != Qw_mask.size(1):
                padding = torch.zeros(Q_output.size(0),
                                      Qw_mask.size(1) - Q_output.size(1),
                                      Q_output.size(2)).type(
                                          Q_output.data.type())
                Q_output = torch.cat([Q_output, Variable(padding)], 1)
        if self.concat:
            Q_h = torch.cat(hns, -1)  # n_Q,n_direc*n_layer*h
        else:
            Q_h = hns[-1]  # n_Q,n_direc*h

        Q_output = F.dropout(
            Q_output, training=self.training, p=self.final_dropout
        )  # n_Q,|Q_tokens|,n_direc*n_layer*h / n_Q,|Q_tokens|,n_direc*h
        Q_h = F.dropout(
            Q_h, training=self.training,
            p=self.h_output)  # n_Q,n_direc*n_layer*h / n_Q,n_direc*h

        # Q2d
        wQ = self.Q2doc(Q_h)  # n_Q,h_Q *    h_Q,h_d --> n_Q, h_d
        #print(type(Q_mask))
        trans_Q = np.zeros([B, Q_mask.size(1), self.doc_output_size],
                           dtype='float32')
        trans_Q = to_var(
            trans_Q, self.args.cuda
        )  # B,max_Q,h_d     *   doc B,h,1  -->   B,max_Q, with mask
        for ex_id in range(B):
            start, end = ex2Q[ex_id]  # Q's pos range in all Q
            trans_Q[ex_id, :end - start, :] = wQ[start:end, :].clone()
        pure_Q = torch.bmm(trans_Q, doc_h.unsqueeze(2)).squeeze(
            2)  # B,max_Q   Q_mask,same size      Q*W*D
        # q 2 each d
        ques_final = ques_h  # manual toggle between ques_h and the attention-weighted ques_final; note this overrides the q_self_weight result computed above (B,h_q --> B,1,h_d)
        if self.args.train_mode == 'string_match_base_dis':
            score_s = torch.bmm(
                self.q2doc_s(ques_final).unsqueeze(1),
                doc_output.transpose(1, 2)).squeeze(
                    1)  #  B,1,h_d * B,h_d,|D| --> B,1,|D|-->  B,|D|
            score_e = torch.bmm(
                self.q2doc_e(ques_final).unsqueeze(1),
                doc_output.transpose(1, 2)).squeeze(
                    1)  #  B,1,h_d * B,h_d,|D| --> B,1,|D|-->  B,|D|

            score_s = score_s.clone()
            score_s[dw_mask.data] = -float('inf')
            score_e = score_e.clone()
            score_e[dw_mask.data] = -float('inf')

            # score_s.data.masked_fill_(dw_mask.data,-float('inf'))
            # score_e.data.masked_fill_(dw_mask.data,-float('inf'))
            if self.training:
                score_s = F.log_softmax(score_s)  # B,|D|, to compute B,max_C
                score_e = F.log_softmax(score_e)  # B,|D|, to compute B,max_C
            else:
                score_s = torch.exp(score_s)  # B,|D|
                score_e = torch.exp(score_e)  # B,|D|
            #print(score_e)
            #print(score_s)
            assert torch.sum(isnan(score_s.data.cpu())) == 0
            assert torch.sum(isnan(score_e.data.cpu())) == 0
            return score_s, score_e, pure_Q  # pure_Q   B,max_Q, before mask

        if self.args.train_mode == 'span':
            score_s = torch.bmm(
                self.q2doc_s(ques_final).unsqueeze(1),
                doc_output.transpose(1, 2)).squeeze(
                    1)  #  B,1,h_d * B,h_d,|D| --> B,1,|D|-->  B,|D|
            score_e = torch.bmm(
                self.q2doc_e(ques_final).unsqueeze(1),
                doc_output.transpose(1, 2)).squeeze(
                    1)  #  B,1,h_d * B,h_d,|D| --> B,1,|D|-->  B,|D|
            #print(ques_final)  # hd:768    D:   B:64
            #print(doc_output)  # ~
            #print(score_s)
            #print(score_e)
            # print({'score_e_before_mask':score_e})
            # print({'dw_mask':dw_mask})
            score_s = score_s.clone()
            score_s[dw_mask.data] = -float('inf')
            score_e = score_e.clone()
            score_e[dw_mask.data] = -float('inf')
            #print(score_e)
            #print(score_s)
            # print({'score_e_before_softmax':score_e})
            #score_s.data.masked_fill_(dw_mask.data,-float('inf'))
            #score_e.data.masked_fill_(dw_mask.data,-float('inf'))
            if self.training or self.normalize:
                score_s = F.softmax(score_s)
                score_e = F.softmax(score_e)
            else:
                score_s = torch.exp(score_s)  # B,|D|
                score_e = torch.exp(score_e)  # B,|D|
            # print({'score_e_after_softmax':score_e})
            #print(score_e)
            #print(score_s)
            # assert torch.sum(isnan(score_e.data.cpu()))==0
            # assert torch.sum(isnan(score_s.data.cpu()))==0
            return score_s, score_e, pure_Q  #,ques_final,doc_output

        if self.args.train_mode == 'contain' or self.args.train_mode == 'NER':
            score = torch.bmm(
                self.q2doc(ques_final).unsqueeze(1),
                doc_output.transpose(1, 2)).squeeze(
                    1)  #  B,1,h_d * B,h_d,|D| --> B,1,|D|-->  B,|D|
            # score.data.masked_fill_(dw_mask.data,-float('inf'))
            score = score.clone()
            score[dw_mask.data] = -float('inf')
            if self.training or self.normalize:
                score = F.softmax(score)
            else:
                score = torch.exp(score)
            assert torch.sum(isnan(score.data.cpu())) == 0
            return score, pure_Q
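The forward pass above reuses one idiom throughout: overwrite padded positions with -inf, then softmax, so padding receives exactly zero probability. A self-contained sketch of that idiom (written against the modern torch API; the code above predates softmax's `dim` argument):

import torch
import torch.nn.functional as F

def masked_softmax(scores, pad_mask):
    scores = scores.clone()           # keep the caller's tensor intact
    scores[pad_mask] = -float('inf')  # padded positions -> zero probability
    return F.softmax(scores, dim=-1)

scores = torch.randn(2, 5)
pad_mask = torch.tensor([[0, 0, 0, 1, 1],
                         [0, 0, 1, 1, 1]], dtype=torch.bool)  # 1 marks padding, as in dw_mask/qw_mask
probs = masked_softmax(scores, pad_mask)  # each row sums to 1 over real tokens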
Example #23
0
def plot_followup_improvements():

    data_path = '~/Faculdade/Mestrado/Projeto/scripts/Working Scripts/'
    data_path = data_path + 'EXPERIMENT_DOWNLOAD/Group_patients-with-brachial-plexus-injury/Per_questionnaire_data/'
    #data_path = data_path + 'Q61802_unified-surgical-evaluation/Responses_Q61802.csv'
    data_path = data_path + 'Q92510_unified-follow-up-assessment/Responses_Q92510.csv'

    followup_data = pd.read_csv(
        data_path,
        header=0,
        delimiter=",",
        na_values=['N/A', 'None', 'nan', 'NAAI', 'NINA'],
        quoting=0,
        encoding='utf8',
        mangle_dupe_cols=False)

    data_path = '~/Faculdade/Mestrado/Projeto/scripts/Working Scripts/'
    data_path = data_path + 'EXPERIMENT_DOWNLOAD/Group_patients-with-brachial-plexus-injury/Per_questionnaire_data/'
    data_path = data_path + 'Q44071_unified-admission-assessment/Responses_Q44071.csv'

    admission_data = pd.read_csv(
        data_path,
        header=0,
        delimiter=",",
        na_values=['N/A', 'None', 'nan', 'NAAI', 'NINA'],
        quoting=0,
        encoding='utf8',
        mangle_dupe_cols=False)
    print(admission_data.shape)
    print(followup_data.shape)
    return_value = {}
    return_period = {}
    surgery_patients = []
    injury_side_column = admission_data.filter(like='opcLdLesao').columns[0]
    merged_data = admission_data.merge(followup_data,
                                       how='inner',
                                       on='participant code',
                                       suffixes=('_a', '_f'))
    for ix, row in merged_data.iterrows():

        if row['participant code'] in return_value.keys():
            if (not utils.isnan(row['opcForca' + row[injury_side_column] +
                                    '[AbdOmbro]_f'])):
                return_value[row['participant code']].append(
                    row['opcForca' + row[injury_side_column] + '[AbdOmbro]_f'])

                if (row['formTempoAval_f'] <
                        return_period[row['participant code']][-1]):
                    return_value[row['participant code']][-1], return_value[
                        row['participant code']][-2] = return_value[
                            row['participant code']][-2], return_value[
                                row['participant code']][-1]
                    tmp = return_period[row['participant code']][-1]
                    return_period[
                        row['participant code']][-1] = row['formTempoAval_f']
                    return_period[row['participant code']].append(tmp)
                else:
                    return_period[row['participant code']].append(
                        row['formTempoAval_f'])
        else:
            if (not utils.isnan(row['opcForca' + row[injury_side_column] +
                                    '[AbdOmbro]_a'])):
                return_value[row['participant code']] = [
                    row['opcForca' + row[injury_side_column] + '[AbdOmbro]_a']
                ]
                return_period[row['participant code']] = [
                    row['formTempoAval_a']
                ]

                if (not utils.isnan(row['opcForca' + row[injury_side_column] +
                                        '[AbdOmbro]_f'])):
                    return_value[row['participant code']].append(
                        row['opcForca' + row[injury_side_column] +
                            '[AbdOmbro]_f'])
                    return_period[row['participant code']].append(
                        row['formTempoAval_f'])
        if (row['snCplexoAt_a'] == 'S' or row['snCplexoAt_f'] == 'S'):
            surgery_patients.append(row['participant code'])

    spatients_to_plot = []
    speriods_to_plot = []
    nspatients_to_plot = []
    nsperiods_to_plot = []
    for patient in return_value.keys():
        if (len(return_value[patient]) >= 3):
            if (patient in surgery_patients):
                spatients_to_plot.append(return_value[patient])
                speriods_to_plot.append(return_period[patient])
            else:
                nspatients_to_plot.append(return_value[patient])
                nsperiods_to_plot.append(return_period[patient])

    print(min([b for a in return_period.values() for b in a]))
    print(max([b for a in return_period.values() for b in a]))
    exit()  # note: the plotting code below is unreachable while this early exit is in place

    for j in range(0, len(spatients_to_plot), 5):
        ax = plt.subplot(111)
        plt.axis((0, 3000, -1, 6))
        for i in range(j, j + 5):
            if (i < len(spatients_to_plot)):
                ax.plot(speriods_to_plot[i], spatients_to_plot[i],
                        'x-')  #,color=colors[i])
            else:
                break
        plt.show()

    ax = plt.subplot(111)
    plt.axis((0, 3000, -1, 6))
    for i in range(len(nspatients_to_plot)):
        ax.plot(nsperiods_to_plot[i], nspatients_to_plot[i], 'x-')
    plt.show()
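The swap above only repairs the latest out-of-order (period, value) pair for a patient; sorting each patient's paired lists once after the loop is an equivalent, more robust formulation. A standalone sketch with hypothetical data:

periods = [30, 180, 90]   # follow-up times in days, possibly out of order
values = ['N', 'S', 'N']  # matching outcome values
pairs = sorted(zip(periods, values))
periods, values = map(list, zip(*pairs))  # -> [30, 90, 180] and ['N', 'N', 'S']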
Example #24
0
    def predict_rec(self, X, node, shuffle_attribute=None):

        # if the node is a class node, then it should return the class distribution
        if node.is_class:

            d = {}
            # sum of class distributions (absolute values)
            s = sum(node.distr.values())
            # for each class
            for k in node.distr.keys():
                # if s == 0, then there are no instances at the node - which means that
                # it's a decision node coming from a nan branch (branch_nan)
                if (s == 0):
                    # adds 1 to the final_class (most probable one) in case of a possible
                    # future classification of an instance that ends up at this final node
                    d[node.final_class] = 1
                    return d
                # returns the node distribution (relative values)
                else:
                    d[k] = node.distr[k] / s

            return d

        # if the value of the node feature should be permuted on the instance
        if (shuffle_attribute is not None
                and node.feature_index == shuffle_attribute):

            # list of probabilities to randomly assign the instance to the node branches
            probs = []
            # for each node that descend from the branches (except the last one)
            for j in range(len(node.branches) - 1):
                # add to the list the probability that the instance would end up at the node
                # if it was randomly assigned to it - that is, the number of instances at the node
                # divided by the number of instances at its parent node (rounded to 5 decimal places).
                probs.append(
                    round(
                        sum(node.branches[j].distr.values()) /
                        sum(node.distr.values()), 5))

            # if there is a branch for missing values at the node
            if (node.branch_nan is not None):
                # add to the list the probability that the instance would end up at the node from the last branch
                probs.append(
                    round(
                        sum(node.branches[len(node.branches) -
                                          1].distr.values()) /
                        sum(node.distr.values()), 5))

            # if the sum of probabilities exceeded 1
            if (1 - sum(probs) < 0):
                #change the last probability to be 1 - the sum of probabilities
                probs[-1] = round(1 - sum(probs[:-1]), 5)
                # the last branch (or the nan branch, if it exists) will be assigned with probability 0
                probs.append(0)
            # probability for the last branch (or the nan branch, if it exists)
            else:
                probs.append(1 - sum(probs))

            # randomly select the branch according to the probabilities calculated above
            i = np.random.choice(range(len(probs)), p=probs)

            # if the nan branch was selected, continue the prediction running the instance
            # through that branch
            if (i == len(node.branches) and node.branch_nan is not None):
                return self.predict_rec(X, node.branch_nan, shuffle_attribute)
            # continue the prediction running the instance through the randomly chosen branch
            else:
                return self.predict_rec(X, node.branches[i], shuffle_attribute)

        # if the value of instance X for the feature on the node is missing
        if (utils.isnan(X[node.feature_index])):
            # if there isn't a nan branch (C4.5 approach)
            if (node.branch_nan is None):
                # list of possible outcomes
                distr = []
                # list of relative distribution of possible outcomes
                prob_branch = []
                # add to the list of possible outcomes the prediction of the instance
                # through each one of the branches
                for n in node.branches:
                    distr.append(self.predict_rec(X, n, shuffle_attribute))
                    prob_branch.append(
                        sum(n.distr.values()) / sum(node.distr.values()))

                d = {}
                # for each possible class at the node
                for k in node.distr.keys():
                    d[k] = 0
                    # for each possible outcome, add to the distribution the
                    # probability of that outcome
                    for i in range(len(distr)):
                        d[k] += prob_branch[i] * distr[i][k]
                return d
            # if there is a branch for the missing values
            else:
                # continue prediction through the nan branch
                y = self.predict_rec(X, node.branch_nan, shuffle_attribute)

        # if the value of instance X for the node feature is not missing and
        # it corresponds to a numeric feature (len(node.values) = 1)
        elif len(node.values) == 1:
            # if the value of instance X for the node feature is less than the
            # value to compare, then continue the prediction through the left
            # branch (node.branches[0]).
            if (X[node.feature_index] <= node.values[0]):
                y = self.predict_rec(X, node.branches[0], shuffle_attribute)
            # else continue through the right branch (node.branches[1])
            else:
                y = self.predict_rec(X, node.branches[1], shuffle_attribute)
        # if the node feature is categorical
        else:
            # node.values.index(str(X[node.feature_index])) should return the
            # index of X's value for the node feature in the node.values list;
            # if it can't find it, this value hasn't been seen yet (none of the
            # instances used to train the tree had it), and in that case a
            # ValueError is raised.
            try:
                y = self.predict_rec(
                    X, node.branches[node.values.index(
                        str(X[node.feature_index]))], shuffle_attribute)
            except ValueError:
                # a value unseen during the training phase is treated as a missing value.
                # if there is a nan branch, continue prediction through it
                if (node.branch_nan is not None):
                    y = self.predict_rec(X, node.branch_nan, shuffle_attribute)
                # if there isn't a nan branch, then use C4.5 approach
                else:
                    distr = []
                    prob_branch = []
                    for n in node.branches:
                        distr.append(self.predict_rec(X, n, shuffle_attribute))
                        prob_branch.append(
                            sum(n.distr.values()) / sum(node.distr.values()))
                    d = {}
                    for k in node.distr.keys():
                        d[k] = 0
                        for i in range(len(distr)):
                            d[k] += prob_branch[i] * distr[i][k]
                    return d

        return y
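For a missing value with no nan branch, the method above marginalizes over the branches, weighting each branch's class distribution by the fraction of training instances that reached it. The same computation on a hypothetical two-branch node:

branch_distrs = [{'yes': 0.9, 'no': 0.1}, {'yes': 0.2, 'no': 0.8}]
branch_probs = [0.75, 0.25]  # fraction of the node's instances per branch
d = {k: sum(p * distr[k] for p, distr in zip(branch_probs, branch_distrs))
     for k in branch_distrs[0]}
# d == {'yes': 0.725, 'no': 0.275}, still summing to 1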
Example #25
0
def plot_followup_pain():
    data_path = '~/Faculdade/Mestrado/Projeto/scripts/Working Scripts/'
    data_path = data_path + 'EXPERIMENT_DOWNLOAD/Group_patients-with-brachial-plexus-injury/Per_questionnaire_data/'
    #data_path = data_path + 'Q61802_unified-surgical-evaluation/Responses_Q61802.csv'
    data_path = data_path + 'Q92510_unified-follow-up-assessment/Responses_Q92510.csv'
    #data_path = data_path + 'Q44071_unified-admission-assessment/Responses_Q44071.csv'

    data = pd.read_csv(data_path,
                       header=0,
                       delimiter=",",
                       na_values=['N/A', 'None', 'nan', 'NAAI', 'NINA'],
                       quoting=0,
                       encoding='utf8',
                       mangle_dupe_cols=False)

    # admission_data = pd.read_csv('~/Faculdade/Mestrado/Projeto/scripts/Working Scripts/Dor.csv', header=0, delimiter=",",
    #         na_values=['N/A', 'None','nan','NAAI','NINA'], quoting=0, encoding='utf8', mangle_dupe_cols=False)

    outcome = 'snDorPos'  #'opcForcaD[FlexCotovelo]' #'snDorPos'
    #outcome_left = 'snDorPos'#'opcForcaE[FlexCotovelo]'#'snDorPos'
    #print(len(([int(a/30) for a in data['formTempoAval']])))
    patients_considered = {}
    patient_outcomes = {}
    #return_periods = []
    for i, row in data.iterrows():
        if (row['participant code']) not in patients_considered:
            patients_considered[row['participant code']] = row['formTempoAval']
            patient_outcomes[row['participant code']] = row[outcome]

            #return_periods.append(row['formTempoAval'])
        else:
            if (row['formTempoAval'] >
                    patients_considered[row['participant code']]):
                if (row[outcome] != 'NINA' and not utils.isnan(row[outcome])):
                    patient_outcomes[row['participant code']] = row[outcome]
                    patients_considered[
                        row['participant code']] = row['formTempoAval']

            else:
                if (utils.isnan(patient_outcomes[row['participant code']])):
                    if (row[outcome] != 'NINA'
                            and not utils.isnan(row[outcome])):
                        patient_outcomes[
                            row['participant code']] = row[outcome]
                        patients_considered[
                            row['participant code']] = row['formTempoAval']

                #print(row['participant code'])
    #import pdb
    #pdb.set_trace()
    #labels = {'S':'Sim','N':'Não',np.nan:'Não informado'}
    for k in patients_considered.keys():
        patients_considered[k] = int(patients_considered[k] / 30)

    xlabels = ['N', 'S', np.nan]
    labels = {'S': 'Sim', 'N': 'Não', np.nan: 'Não informado'}  # Yes / No / Not informed
    y = [(Counter(patient_outcomes.values())[x]) for x in xlabels]
    width = 0.8
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.bar(range(len(xlabels)), y, width=width)
    ax.set_xticks(np.arange(len(xlabels)) + width / 2)
    ax.set_xticklabels([labels[l] for l in xlabels])
    plt.xlabel('Sente dor após a lesão?')  # "Do you feel pain after the injury?"
    plt.show()
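The loop above keeps, per patient, the outcome from the latest follow-up that actually reports one. The same selection can be written with a pandas groupby; a sketch on a toy frame using the column names above (patients whose every follow-up is missing drop out here, while the loop keeps them as NaN):

import pandas as pd

df = pd.DataFrame({'participant code': ['P1', 'P1', 'P2'],
                   'formTempoAval': [30, 180, 90],
                   'snDorPos': ['S', None, 'N']})
latest = (df.dropna(subset=['snDorPos'])   # ignore rows with no outcome
            .sort_values('formTempoAval')  # order follow-ups by period
            .groupby('participant code')['snDorPos']
            .last())                       # keep the latest reported value
# latest: P1 -> 'S', P2 -> 'N'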
Example #26
0
    def find_split(self, X, y, feature_indices, weights):

        best_gain = -float('inf')
        best_feature_index = -1
        best_value = [0]

        # for each feature to be considered
        for feature_index in sorted(feature_indices):
            # get rows of instances with known values for the feature
            not_nan_rows = [
                a for a in range(X.shape[0])
                if not utils.isnan(X[:, feature_index][a])
            ]

            Xnotnan = (X[not_nan_rows, :])
            ynotnan = y[not_nan_rows]

            # if there aren't any instances with known values for the feature, go to the next one
            if (Xnotnan.shape[0] == 0):
                continue

            # get all possible values for the feature index
            values = sorted(set(Xnotnan[:, feature_index]))

            # if the values are numeric
            if (utils.isnum(Xnotnan[0, feature_index])):

                # split the data using each value
                for j in range(len(values) - 1):

                    #value = (float(values[j]) + float(values[j+1]))/2 -- original
                    value = values[j]
                    # split data using the feature and the value
                    Xs, ys, d = utils.split_num(Xnotnan, ynotnan,
                                                feature_index, value)
                    # calculate gain considering the rate of missing values.
                    # the bigger the rate, the smaller the gain
                    gain = (len(ynotnan) / len(y)) * utils.information_gain(
                        ynotnan, ys)

                    if gain >= best_gain:
                        # if there's a tie on info gain, decide using gain ratio
                        # if(gain == best_gain and best_feature_index != -1):
                        #     print('tie of gain')
                        #     gr = utils.gain_ratio(ynotnan,ys,y)
                        #     not_nan_rows = [a for a in range(X.shape[0]) if not utils.isnan(X[:,best_feature_index][a])]
                        #     Xss,yss, ds = utils.split(X[not_nan_rows,:],y[not_nan_rows],best_feature_index,best_value)
                        #     # calculate gain ratio of previous best feature to compare
                        #     gr_p = utils.gain_ratio(ynotnan,yss,y)
                        #     # if the current feature's gain ratio is not better than the previous one, then
                        #     # go to the next feature
                        #     if(gr < gr_p):
                        #         continue

                        best_gain = gain
                        best_feature_index = feature_index
                        best_value = [
                            values[j]
                        ]  # C4.5 chooses the largest value in the training set that
                        # does not exceed the midpoint (value). This ensures that all
                        # threshold values appearing in trees actually occur in the data
            # if the values are categorical
            else:
                # split the data using the values
                Xs, ys, d = utils.split_categ(Xnotnan, ynotnan, feature_index,
                                              values)

                gain = ((len(ynotnan) / len(y)) *
                        utils.information_gain(ynotnan, ys)
                        )  #utils.gain_ratio(ynotnan,ys,y))

                if gain >= best_gain:
                    # if(gain == best_gain and best_feature_index != -1):
                    #     print('tie of gain')
                    #     gr = utils.gain_ratio(ynotnan,ys,y)
                    #     not_nan_rows = [a for a in range(X.shape[0]) if not utils.isnan(X[:,best_feature_index][a])]
                    #     Xss,yss, ds = utils.split(X[not_nan_rows,:],y[not_nan_rows],best_feature_index,best_value)
                    #     gr_p = utils.gain_ratio(ynotnan,yss,y)
                    #     if(gr < gr_p):
                    #         continue

                    best_gain = gain
                    best_feature_index = feature_index
                    best_value = values

        return best_feature_index, best_value
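`utils.information_gain` is not shown in this listing; judging from how it is called above (parent labels plus the list of per-branch label arrays), a standard entropy-based version might look like this sketch. It is an assumption, not the original helper:

import numpy as np
from collections import Counter

def entropy(y):
    counts = np.array(list(Counter(y).values()), dtype=float)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))  # p > 0 by construction, so log2 is safe

def information_gain(y, ys):
    # parent entropy minus the size-weighted entropy of the child splits
    return entropy(y) - sum(len(s) / len(y) * entropy(s) for s in ys)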
Example #27
0
    def _sanity_check(self, ground_metric_matrix):
        assert not (ground_metric_matrix < 0).any()
        assert not (isnan(ground_metric_matrix).any())