Example #1
0
 def _spearman(a, b):
     """Return element 0 of ``spearmanr(a, b)``.

     Assuming ``spearmanr`` is SciPy's implementation, element 0 is the
     rank-correlation coefficient and the remaining element (the p-value)
     is discarded.
     """
     outcome = spearmanr(a, b)
     return outcome[0]
Example #2
0
def test_ranker(output, client, listen_port, group):
    """Train a ``DaskLGBMRanker`` and compare it with a local ``LGBMRanker``.

    Both rankers are fitted with the same hyper-parameters. The test checks that

    * the distributed scores rank-correlate with the labels (> 0.6) and with
      the scores of the locally trained model (> 0.8),
    * the model returned by ``to_local()`` reproduces the distributed scores,
    * ``pred_leaf=True`` yields one column per tree and plausible leaf ids,
    * for ``'dataframe-with-categorical'`` input, at least one split uses a
      categorical column with an ``'=='`` decision.

    The Dask client is closed in a ``finally`` clause so that it is released
    even when fitting or one of the assertions fails.

    Args:
        output: Data container kind forwarded to ``_create_ranking_data``.
            ``'array'`` and ``'dataframe-with-categorical'`` get special
            handling in this test.
        client: Dask client used for training; closed before returning.
        listen_port: Passed to the ranker as ``local_listen_port``.
        group: Forwarded to ``_create_ranking_data``.
    """
    try:
        data_kwargs = {"output": output, "group": group}
        if output == 'dataframe-with-categorical':
            # A single informative feature is requested for the categorical case.
            data_kwargs.update(n_features=1, n_informative=1)
        X, y, w, g, dX, dy, dw, dg = _create_ranking_data(**data_kwargs)

        # rebalance small dask.Array dataset for better performance.
        if output == 'array':
            dX = dX.persist()
            dy = dy.persist()
            dw = dw.persist()
            dg = dg.persist()
            _ = wait([dX, dy, dw, dg])
            client.rebalance()

        # use many trees + leaves to overfit, help ensure that Dask data-parallel strategy matches that of
        # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
        params = {
            "random_state": 42,
            "n_estimators": 50,
            "num_leaves": 20,
            "min_child_samples": 1
        }

        dask_ranker = lgb.DaskLGBMRanker(
            client=client,
            time_out=5,
            local_listen_port=listen_port,
            tree_learner_type='data_parallel',
            **params
        )
        dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
        rnkvec_dask = dask_ranker.predict(dX)
        rnkvec_dask = rnkvec_dask.compute()
        p1_pred_leaf = dask_ranker.predict(dX, pred_leaf=True)
        rnkvec_dask_local = dask_ranker.to_local().predict(X)

        local_ranker = lgb.LGBMRanker(**params)
        local_ranker.fit(X, y, sample_weight=w, group=g)
        rnkvec_local = local_ranker.predict(X)

        # distributed ranker should be able to rank decently well and should
        # have high rank correlation with scores from serial ranker.
        dcor = spearmanr(rnkvec_dask, y).correlation
        assert dcor > 0.6
        assert spearmanr(rnkvec_dask, rnkvec_local).correlation > 0.8
        assert_eq(rnkvec_dask, rnkvec_dask_local)

        # pred_leaf values should have the right shape
        # and values that look like valid tree nodes
        pred_leaf_vals = p1_pred_leaf.compute()
        assert pred_leaf_vals.shape == (
            X.shape[0],
            dask_ranker.booster_.num_trees()
        )
        assert np.max(pred_leaf_vals) <= params['num_leaves']
        assert np.min(pred_leaf_vals) >= 0
        assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']

        # be sure LightGBM actually used at least one categorical column,
        # and that it was correctly treated as a categorical feature
        if output == 'dataframe-with-categorical':
            cat_cols = [
                col for col in dX.columns
                if dX.dtypes[col].name == 'category'
            ]
            tree_df = dask_ranker.booster_.trees_to_dataframe()
            node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
            assert node_uses_cat_col.sum() > 0
            assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='
    finally:
        # Always release the client, including on assertion failure.
        client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Example #3
0
def spearmanr(a, b):
    """Return only element 0 of ``stats.spearmanr(a, b)``.

    For SciPy's ``stats.spearmanr`` this is the correlation coefficient;
    the p-value that comes with it is dropped.
    """
    outcome = stats.spearmanr(a, b)
    return outcome[0]
Example #4
0
def main(data_dir, model_file, out_format, word_analogy, word_similarity, entity_similarity,
         lowercase, batch_size, vocab_size):
    """Evaluate a Wikipedia2Vec model on the benchmark files below *data_dir*.

    Each enabled benchmark appends ``(name, score, oov_count)`` tuples to a
    result list, which is finally printed in the requested format.

    Args:
        data_dir: Root directory containing ``word/similarity``,
            ``word/analogy`` and ``entity/similarity`` sub-directories.
        model_file: Path handed to ``Wikipedia2Vec.load``.
        out_format: ``'text'`` or ``'csv'``; any other value prints nothing.
        word_analogy: Run the word analogy benchmark when truthy.
        word_similarity: Run the word similarity benchmark (``.txt`` files
            only) when truthy.
        entity_similarity: Run the entity similarity benchmark when truthy.
        lowercase: Lower-case benchmark words before looking them up.
        batch_size: Number of analogy questions scored per matrix product.
        vocab_size: Not used by this function.
    """
    model = Wikipedia2Vec.load(model_file)

    results = []

    if word_similarity:
        base_dir = os.path.join(data_dir, 'word', 'similarity')
        for filename in os.listdir(base_dir):
            if not filename.endswith('.txt'):
                continue

            oov_count = 0
            with open(os.path.join(base_dir, filename)) as f:
                gold = []
                estimated = []
                for line in f:
                    if not line.strip():
                        # Blank (e.g. trailing) lines would break the unpacking below.
                        continue
                    (w1, w2, val) = line.split()
                    val = float(val)
                    if lowercase:
                        (w1, w2) = (w1.lower(), w2.lower())
                    try:
                        v1 = model.get_word_vector(w1)
                        v2 = model.get_word_vector(w2)
                    except KeyError:
                        # Either word missing counts as one out-of-vocabulary pair.
                        oov_count += 1
                        continue

                    gold.append(val)
                    estimated.append(1.0 - cosine(v1, v2))

                results.append((filename[:-4], spearmanr(gold, estimated)[0], oov_count))

    if word_analogy:
        base_dir = os.path.join(data_dir, 'word', 'analogy')

        # The L2-normalised word embeddings are the same for every benchmark
        # file, so build them once instead of once per file.
        offset = model.dictionary.entity_offset
        word_emb = model.syn0[:offset] / np.linalg.norm(model.syn0[:offset], 2, axis=1, keepdims=True)

        for filename in os.listdir(base_dir):
            with open(os.path.join(base_dir, filename)) as f:
                (A_ind, B_ind, C_ind, D_ind) = ([], [], [], [])
                oov_count = 0
                for line in f:
                    # Lines starting with ':' are section headers; blank lines carry no question.
                    if line.startswith(':') or not line.strip():
                        continue
                    tokens = line.lower().split() if lowercase else line.split()
                    words = [model.get_word(token) for token in tokens]
                    if any(w is None for w in words):
                        oov_count += 1
                        continue

                    (a_ind, b_ind, c_ind, d_ind) = (w.index for w in words)
                    A_ind.append(a_ind)
                    B_ind.append(b_ind)
                    C_ind.append(c_ind)
                    D_ind.append(d_ind)

                (A, B, C) = (word_emb[A_ind], word_emb[B_ind], word_emb[C_ind])
                D = (B - A + C)
                del A, B, C

                predictions = []

                for i in trange(0, D.shape[0], batch_size, desc=filename[:-4]):
                    D_batch = D[i:i+batch_size]
                    dot_ret = np.dot(word_emb, D_batch.T)
                    # The three question words must never be returned as the answer.
                    for (j, indices) in enumerate(zip(A_ind[i:i+batch_size], B_ind[i:i+batch_size],
                                                      C_ind[i:i+batch_size])):
                        dot_ret[indices, j] = float('-inf')
                    predictions.append(np.argmax(dot_ret, 0))

                if predictions:
                    accuracy = np.mean(np.hstack(predictions) == D_ind)
                else:
                    # No usable question in this file; np.hstack([]) would raise.
                    accuracy = float('nan')
                results.append((filename[:-4], accuracy, oov_count))

    if entity_similarity:
        category_mapping = {e: c for (c, l) in KORE_CATEGORIES.items() for e in l}

        base_dir = os.path.join(data_dir, 'entity', 'similarity')
        for filename in os.listdir(base_dir):
            with open(os.path.join(base_dir, filename)) as f:
                if filename == 'KORE.txt':
                    # Un-indented lines name a seed entity; the tab-indented
                    # lines after it list the entities compared against it.
                    data = defaultdict(list)
                    title = None
                    for line in f:
                        line = line.rstrip()
                        if line.startswith('\t'):
                            data[title].append(line[1:])
                        else:
                            title = line

                    kore_results = defaultdict(list)
                    oov_count = 0
                    for (title, title_list) in data.items():
                        try:
                            v1 = model.get_entity_vector(title)
                        except KeyError:
                            oov_count += len(title_list)
                            continue

                        estimated = []
                        for title2 in title_list:
                            try:
                                v2 = model.get_entity_vector(title2)
                            except KeyError:
                                oov_count += 1
                                continue
                            estimated.append(1.0 - cosine(v1, v2))

                        # Gold ranking is the file order: earlier entries get higher scores.
                        gold = list(reversed(range(len(estimated))))
                        kore_results[category_mapping[title]].append(spearmanr(gold, estimated)[0])

                    results.append((filename[:-4], np.mean(list(chain(*kore_results.values()))), oov_count))

                else:
                    gold = []
                    estimated = []
                    oov_count = 0
                    for (n, line) in enumerate(f):
                        if n == 0:
                            # First line is skipped (treated as a header).
                            continue
                        line = line.rstrip()
                        if not line:
                            # Blank lines would break the 7-field unpacking below.
                            continue
                        (_, _, title1, _, _, title2, score) = line.split('\t')

                        try:
                            v1 = model.get_entity_vector(title1.replace('_', ' '))
                            v2 = model.get_entity_vector(title2.replace('_', ' '))
                        except KeyError:
                            oov_count += 1
                            continue

                        gold.append(float(score))
                        estimated.append(1.0 - cosine(v1, v2))

                    results.append((filename[:-4], spearmanr(gold, estimated)[0], oov_count))

    if out_format == 'text':
        for (name, score, oov_count) in results:
            print('%s: ' % name)
            print('  Spearman score: %.4f' % score)
            print('  OOV instances: %d' % oov_count)

    elif out_format == 'csv':
        print('name,' + ','.join([o[0] for o in results]))
        print('score,' + ','.join(['%.4f' % o[1] for o in results]))
        print('oov,' + ','.join(['%d' % o[2] for o in results]))
# Comparing the means of two independent samples of scores.
dt = np.array([
    24, 43, 58, 71, 43, 49, 61, 44, 67, 49, 53, 56, 59, 52, 62, 54, 57, 33, 46,
    43, 57
])
dc = np.array([
    42, 43, 55, 26, 62, 37, 33, 41, 19, 54, 20, 85, 46, 10, 17, 60, 53, 42, 37,
    42, 55, 28, 48
])
print(stats.ttest_ind(dt, dc))
# pvalue=0.02
print(stats.ttest_ind(dt, dc, equal_var=False))  # variant that does not assume equal variances
print(stats.jarque_bera(dt))  # does the sample match a normal distribution?
print(stats.jarque_bera(dc))

# The correlation functions need paired observations of equal length, but dt
# has 21 values and dc has 23, so passing them directly raises. Only the first
# n_paired values of each are used here. NOTE(review): the samples are
# independent, so this pairing is arbitrary and only demonstrates the API.
n_paired = min(len(dt), len(dc))
print(stats.pearsonr(dt[:n_paired], dc[:n_paired]))  # Pearson correlation coefficient
print(stats.spearmanr(dt[:n_paired], dc[:n_paired]))
print(stats.kendalltau(dt[:n_paired], dc[:n_paired]))

import matplotlib.pyplot as plt

# Load the CSV, skipping its first line.
read_file = pd.read_csv('play_13_14_top30.csv', skiprows=1)
# NOTE(review): the results of the next two calls are discarded; they only
# display something in an interactive session.
read_file.describe()
read_file.head()

# Box-plot the summary table returned by describe(), not the raw rows.
a = read_file.describe()
a.boxplot()
plt.show()

re_file = read_file.rename(
    columns={
        'P': 'points',
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt

# Both files are tab-separated and have no header row.
trainingDataSet = pd.read_csv("A3_training_dataset.tsv", delimiter="\t", header=None)
testDataSet = pd.read_csv("A3_test_dataset.tsv", delimiter="\t", header=None)

# The last column is the class label; everything before it is a feature.
trainingCorrData = trainingDataSet.iloc[:, :-1]
classLabel = trainingDataSet.iloc[:, -1]

# Pairwise Spearman correlation between the feature columns.
correlation, pValue = stats.spearmanr(trainingCorrData)

# A feature j is dropped when any earlier feature i < j has a correlation
# with it ABOVE 0.2; upper_pairs marks exactly the (i, j) pairs with i < j.
n_features = correlation.shape[0]
upper_pairs = np.triu(np.ones((n_features, n_features), dtype=bool), k=1)
columns = ~np.any(upper_pairs & (correlation > 0.2), axis=0)

columns_Selected = trainingCorrData.columns[columns]
print("Columns selected:", len(columns_Selected))

# Rebuild the training frame from the kept features and append the label.
trainingData = pd.DataFrame(trainingCorrData[columns_Selected])
trainingData.insert(
    loc=len(columns_Selected),
    column="class",
    value=classLabel,
)
def _rank(x, y):
    """Return the ``correlation`` attribute of ``spearmanr(x, y)``."""
    outcome = spearmanr(x, y)
    return outcome.correlation
def correlation(a, b, method=EnumMethod.kendall):
    '''(list|ndarray, list|ndarray, enumeration) -> dict
    Compute a correlation statistic between two equally sized samples.

    a, b: either both lists or both numpy arrays of identical shape.
        Arrays are flattened before the statistic is computed. When both
        arrays contain NaNs, paired NaNs are removed first.
    method: member of EnumMethod (kendall, pearson or spearman).

    Returns a dictionary: {'teststat': teststat, 'p': pval}

    Raises ValueError when the inputs mix types, differ in shape or length,
    still contain NaNs after paired NaNs were dropped, or when method is
    not a handled EnumMethod member.
    '''
    a_is_array = isinstance(a, _np.ndarray)
    b_is_array = isinstance(b, _np.ndarray)

    if a_is_array or b_is_array:
        if not (a_is_array and b_is_array):
            raise ValueError(
                'If numpy arrays are used, both must be ndarray types')

        if a.shape != b.shape:
            raise ValueError('Numpy array shapes must match exactly')

        # scipy cannot cope with NaNs: drop the paired ones and leave every
        # other pairing untouched.
        both_have_nans = (_arraylib.np_contains_nan(a)
                          and _arraylib.np_contains_nan(b))
        if both_have_nans:
            cleaned = _arraylib.np_delete_paired_nans_flattened(a, b)
        else:
            cleaned = {'a': a, 'b': b}

        # Any NaN left now is unmatched (a NaN facing a number in the other
        # array). That is treated as an error; it could later be relaxed to
        # exclude such positions from both arrays.
        if _arraylib.np_contains_nan(cleaned['a']):
            raise ValueError('Numpy array a contains NaNs')

        if _arraylib.np_contains_nan(cleaned['b']):
            raise ValueError('Numpy array b contains NaNs')

        lst_a = cleaned['a'].flatten().tolist()
        lst_b = cleaned['b'].flatten().tolist()
    else:
        if not (isinstance(a, list) and isinstance(b, list)):
            raise ValueError('If lists are used, both must be list types')
        # Work on copies of the caller's lists.
        lst_a = copy.deepcopy(a)
        lst_b = copy.deepcopy(b)

    if len(lst_a) != len(lst_b):
        raise ValueError('Array lengths must match exactly')

    assert isinstance(lst_a, list)
    assert isinstance(lst_b, list)

    # Dispatch on the requested method; the argument-less case() acts as the
    # fall-back branch. (An rpy2-based Spearman variant used to be sketched
    # here in commented-out code.)
    for case in _baselib.switch(method):
        if case(EnumMethod.kendall):
            teststat, pval = _stats.kendalltau(lst_a, lst_b)
            break
        if case(EnumMethod.pearson):
            teststat, pval = _stats.pearsonr(lst_a, lst_b)
            break
        if case(EnumMethod.spearman):
            teststat, pval = _stats.spearmanr(lst_a, lst_b)
            break
        if case():
            raise ValueError('Enumeration member not in e_method')

    return {'teststat': teststat, 'p': pval}
def SFC_by_tissue_seg(structure_file_path, function_file_path,
                      electrode_localization_by_atlas_file_path,
                      electrode_localization_by_classification_atlas_file_path,
                      outputfile):
    """Compute structure-function correlations separately for two electrode groups.

    Electrodes are split by the value in the classification-atlas column
    (== 0 versus > 0). For each group, functional connectivity is averaged
    per atlas region and correlated (Spearman) with the log-scaled,
    normalised structural connectivity at each index along the third axis
    of the functional arrays. Each group's result is pickled to its own file.

    :param structure_file_path: ``.mat`` file read with ``loadmat``; its
        ``'connectivity'`` entry is the structural matrix.
    :param function_file_path: pickle file holding seven objects: five
        functional connectivity arrays (broadband, alphatheta, beta,
        lowgamma, highgamma), the electrode names, and a description of the
        matrix order.
    :param electrode_localization_by_atlas_file_path: CSV whose columns at
        positions 0 and 4 provide ``electrode_name`` and ``region_number``.
    :param electrode_localization_by_classification_atlas_file_path: CSV
        whose columns at positions 0 and 4 are merged in on
        ``electrode_name``.
    :param outputfile: path prefix; ``'_inside_correlation.pickle'`` and
        ``'_outside_correlation.pickle'`` are appended to it.
    :return: None; results are written to the two pickle files.
    """
    #Get functional connectivity data in pickle file format
    # NOTE(review): pickle.load can execute arbitrary code; only use with trusted files.
    with open(function_file_path, 'rb') as f:
        broadband, alphatheta, beta, lowgamma, highgamma, electrode_row_and_column_names, order_of_matrices_in_pickle_file = pickle.load(
            f)
    FC_list_global = [broadband, alphatheta, beta, lowgamma, highgamma]

    # set up the dataframe of electrodes to analyze
    final_electrodes = pd.DataFrame(electrode_row_and_column_names,
                                    columns=['electrode_name'])
    final_electrodes = final_electrodes.reset_index()
    final_electrodes = final_electrodes.rename(columns={"index": "func_index"})

    #Get Structural Connectivity data in mat file format. Output from DSI studio
    structural_connectivity_array_global = np.array(
        pd.DataFrame(loadmat(structure_file_path)['connectivity']))

    #Get electrode localization by atlas csv file data. From get_electrode_localization.py
    electrode_localization_by_atlas = pd.read_csv(
        electrode_localization_by_atlas_file_path)

    # Get electrode localization by classification atlas
    electrode_localization_by_class_atlas = pd.read_csv(
        electrode_localization_by_classification_atlas_file_path)

    # normalizing and log-scaling the structural matrices
    structural_connectivity_array_global[structural_connectivity_array_global
                                         == 0] = 1
    structural_connectivity_array_global = np.log10(
        structural_connectivity_array_global
    )  # log-scaling. Converting 0s to 1 to avoid taking log of zeros
    structural_connectivity_array_global = structural_connectivity_array_global / np.max(
        structural_connectivity_array_global)  # normalization

    #Only consider electrodes that are in both the localization and the pickle file
    final_electrodes = final_electrodes.merge(
        electrode_localization_by_atlas.iloc[:, [0, 4]], on='electrode_name')
    # Remove electrodes in the Functional Connectivity matrices that have a region of 0
    final_electrodes = final_electrodes[final_electrodes['region_number'] != 0]
    # now join in the classification region number
    final_electrodes = final_electrodes.merge(
        electrode_localization_by_class_atlas.iloc[:, [0, 4]],
        on='electrode_name')
    # Two passes: perm == 0 handles one electrode group, perm == 1 the other.
    for perm in range(0, 2):
        # Shallow list copy: elements are re-bound below, so the arrays in
        # FC_list_global are not modified.
        FC_list = FC_list_global.copy()
        structural_connectivity_array = structural_connectivity_array_global.copy(
        )
        if (perm == 0):
            #we will first compute electrodes that are inside the classification atlas
            # grey matter
            # Column 3 is the value merged in from the classification atlas.
            final_electrodes_cur = final_electrodes[final_electrodes.iloc[:, 3]
                                                    == 0]
            # adjust the output dir
            outputfile_adj = outputfile + '_inside_correlation.pickle'
        else:
            #we will next compute electrodes that are outside the classification atlas
            # white matter
            final_electrodes_cur = final_electrodes[
                final_electrodes.iloc[:, 3] > 0]
            # adjust the output dir
            outputfile_adj = outputfile + '_outside_correlation.pickle'
        # Keep only the rows and columns of the electrodes in the current group.
        for i in range(len(FC_list)):
            FC_list[i] = FC_list[i][final_electrodes_cur['func_index'], :, :]
            FC_list[i] = FC_list[i][:, final_electrodes_cur['func_index'], :]

        #Fisher z-transform of functional connectivity data. This is to take means of correlations and do correlations to the structural connectivity
        #Fisher z transform is just arctanh
        for i in range(len(FC_list)):
            FC_list[i] = np.arctanh(FC_list[i])

        # Remove structural ROIs not in electrode_localization ROIs
        # Column 2 is the atlas region number from the first merge.
        electrode_ROIs = np.unique(np.array(final_electrodes_cur.iloc[:, 2]))
        electrode_ROIs = electrode_ROIs[~(electrode_ROIs
                                          == 0)]  #remove region 0
        structural_index = electrode_ROIs - 1  #subtract 1 because of python's zero indexing
        structural_connectivity_array = structural_connectivity_array[
            structural_index, :]
        structural_connectivity_array = structural_connectivity_array[:,
                                                                      structural_index]

        #taking average functional connectivity for those electrodes in same atlas regions
        for i in range(len(FC_list)):
            ROIs = np.array(final_electrodes_cur.iloc[:, 2])
            for r in range(len(electrode_ROIs)):
                # index_logical is recomputed every pass because ROIs
                # shrinks as duplicate rows are deleted.
                index_logical = (ROIs == electrode_ROIs[r])
                index_first = np.where(index_logical)[0][0]
                index_second_to_end = np.where(index_logical)[0][1:]
                mean = np.mean(FC_list[i][index_logical, :, :], axis=0)
                # Fill in with mean.
                FC_list[i][index_first, :, :] = mean
                FC_list[i][:, index_first, :] = mean
                #delete the other rows and columns belonging to same region.
                FC_list[i] = np.delete(FC_list[i], index_second_to_end, axis=0)
                FC_list[i] = np.delete(FC_list[i], index_second_to_end, axis=1)
                #keeping track of which electrode labels correspond to which rows and columns
                ROIs = np.delete(ROIs, index_second_to_end, axis=0)
            #remove electrodes in the ROI labeled as zero
            index_logical = (ROIs == 0)
            index = np.where(index_logical)[0]
            FC_list[i] = np.delete(FC_list[i], index, axis=0)
            FC_list[i] = np.delete(FC_list[i], index, axis=1)
            ROIs = np.delete(ROIs, index, axis=0)

        #order FC matrices by ROIs
        # ROIs here is the value left over from the last pass of the loop above.
        order = np.argsort(ROIs)
        for i in range(len(FC_list)):
            FC_list[i] = FC_list[i][order, :, :]
            FC_list[i] = FC_list[i][:, order, :]

        #un-fisher z-transform
        for i in range(len(FC_list)):
            FC_list[i] = np.tanh(FC_list[i])

        #initialize correlation arrays
        Corrrelation_list = [None] * len(FC_list)
        for i in range(len(FC_list)):
            Corrrelation_list[i] = np.zeros([FC_list[0].shape[2]], dtype=float)

        correlation_type = 'spearman'
        #calculate Structure-Function Correlation.
        for i in range(len(FC_list)):
            # NOTE(review): range(shape[2] - 1) never visits the last index
            # along axis 2, so the final entry of each correlation array
            # keeps its initial 0 - confirm this is intended.
            for t in range(FC_list[i].shape[2] - 1):
                #Spearman Rank Correlation: functional connectivity and structural connectivity are non-normally distributed. So we should use spearman
                if correlation_type == 'spearman':
                    Corrrelation_list[i][t] = spearmanr(
                        np.ndarray.flatten(FC_list[i][:, :, t]),
                        np.ndarray.flatten(
                            structural_connectivity_array)).correlation
                    #print("spearman")
                # Pearson Correlation: This is calculated bc past studies use Pearson Correlation and we want to see if these results are comparable.
                if correlation_type == 'pearson':
                    Corrrelation_list[i][t] = pearsonr(
                        np.ndarray.flatten(FC_list[i][:, :, t]),
                        np.ndarray.flatten(structural_connectivity_array))[0]

        # This replaces the value of the same name loaded from the input pickle.
        order_of_matrices_in_pickle_file = pd.DataFrame(
            ["broadband", "alphatheta", "beta", "lowgamma", "highgamma"],
            columns=["Order of matrices in pickle file"])
        with open(outputfile_adj, 'wb') as f:
            pickle.dump([
                Corrrelation_list[0], Corrrelation_list[1],
                Corrrelation_list[2], Corrrelation_list[3],
                Corrrelation_list[4], order_of_matrices_in_pickle_file
            ], f)
def SFC_for_null_model(structure_file_path, FC_list,
                       electrode_row_and_column_names,
                       electrode_localization_by_atlas_file_path):
    """Compute structure-function correlations for one structural matrix.

    Functional connectivity is restricted to electrodes that have a non-zero
    atlas region, averaged per region (in Fisher-z space), and correlated
    (Spearman) with the log-scaled, normalised structural connectivity at
    each index along the third axis of the functional arrays.

    The caller's ``FC_list`` is not modified: the function works on a shallow
    copy whose elements are re-bound to new arrays, so it can be called
    repeatedly with the same list.

    :param structure_file_path: ``.mat`` file read with ``loadmat``; its
        ``'connectivity'`` entry is the structural matrix.
    :param FC_list: list of functional connectivity arrays indexed as
        ``[electrode, electrode, t]``.
    :param electrode_row_and_column_names: electrode names, in the row/column
        order of the functional arrays.
    :param electrode_localization_by_atlas_file_path: CSV whose columns at
        positions 0 and 4 provide ``electrode_name`` and ``region_number``.
    :return: list with one 1-D array of correlations per entry of ``FC_list``.

    Example of how the path arguments may be built::

        sub_ID = 'RID0309'
        atlas_folder = 'RA_N0100'
        perm = 1
        structure_file_path = '/Users/andyrevell/mount/DATA/Human_Data/BIDS_processed/sub-{0}/connectivity_matrices/structural/{1}/sub-{0}_ses-preop3T_dwi-eddyMotionB0Corrected.nii.gz.trk.gz.{1}_Perm{2}.count.pass.connectivity.mat'.format(sub_ID, atlas_folder, '{:04}'.format(perm))
        electrode_localization_by_atlas_file_path = '/Users/andyrevell/mount/DATA/Human_Data/BIDS_processed/sub-{0}/electrode_localization/electrode_localization_by_atlas/sub-{0}_electrode_coordinates_mni_{1}_Perm{2}.csv'.format(sub_ID, atlas_folder, '{:04}'.format(perm))
    """
    # Work on a shallow copy: the loops below re-bind the list elements, and
    # without the copy the caller's list would end up holding the reduced
    # matrices, breaking any later call that reuses it.
    FC_list = list(FC_list)

    # set up the dataframe of electrodes to analyze
    final_electrodes = pd.DataFrame(electrode_row_and_column_names,
                                    columns=['electrode_name'])
    final_electrodes = final_electrodes.reset_index()
    final_electrodes = final_electrodes.rename(columns={"index": "func_index"})

    # Get structural connectivity data in mat file format. Output from DSI studio
    structural_connectivity_array = np.array(
        pd.DataFrame(loadmat(structure_file_path)['connectivity']))

    # Get electrode localization by atlas csv file data. From get_electrode_localization.py
    electrode_localization_by_atlas = pd.read_csv(
        electrode_localization_by_atlas_file_path)

    # Log-scale and normalise the structural matrix. Zeros are set to 1
    # first so that log10 maps them to 0 instead of -inf.
    structural_connectivity_array[structural_connectivity_array == 0] = 1
    structural_connectivity_array = np.log10(structural_connectivity_array)
    structural_connectivity_array = structural_connectivity_array / np.max(
        structural_connectivity_array)

    # Only consider electrodes that are in both the localization file and the name list
    final_electrodes = final_electrodes.merge(
        electrode_localization_by_atlas.iloc[:, [0, 4]], on='electrode_name')
    # Remove electrodes in the functional connectivity matrices that have a region of 0
    final_electrodes = final_electrodes[final_electrodes['region_number'] != 0]
    for i in range(len(FC_list)):
        # Fancy indexing returns new arrays, so the caller's arrays are untouched.
        FC_list[i] = FC_list[i][final_electrodes['func_index'], :, :]
        FC_list[i] = FC_list[i][:, final_electrodes['func_index'], :]

    # Fisher z-transform (arctanh) so that correlations can be averaged
    for i in range(len(FC_list)):
        FC_list[i] = np.arctanh(FC_list[i])

    # Remove structural ROIs not in electrode_localization ROIs
    electrode_ROIs = np.unique(np.array(final_electrodes.iloc[:, 2]))
    electrode_ROIs = electrode_ROIs[~(electrode_ROIs == 0)]  # remove region 0
    structural_index = electrode_ROIs - 1  # subtract 1 because of python's zero indexing
    structural_connectivity_array = structural_connectivity_array[
        structural_index, :]
    structural_connectivity_array = structural_connectivity_array[
        :, structural_index]

    # Average functional connectivity over electrodes in the same atlas region
    for i in range(len(FC_list)):
        ROIs = np.array(final_electrodes.iloc[:, 2])
        for r in range(len(electrode_ROIs)):
            # Recomputed every pass because ROIs shrinks as rows are deleted.
            index_logical = (ROIs == electrode_ROIs[r])
            index_first = np.where(index_logical)[0][0]
            index_second_to_end = np.where(index_logical)[0][1:]
            mean = np.mean(FC_list[i][index_logical, :, :], axis=0)
            # Fill in with mean.
            FC_list[i][index_first, :, :] = mean
            FC_list[i][:, index_first, :] = mean
            # delete the other rows and columns belonging to the same region.
            FC_list[i] = np.delete(FC_list[i], index_second_to_end, axis=0)
            FC_list[i] = np.delete(FC_list[i], index_second_to_end, axis=1)
            # keep track of which region label corresponds to which row/column
            ROIs = np.delete(ROIs, index_second_to_end, axis=0)
        # remove electrodes in the ROI labeled as zero
        index_logical = (ROIs == 0)
        index = np.where(index_logical)[0]
        FC_list[i] = np.delete(FC_list[i], index, axis=0)
        FC_list[i] = np.delete(FC_list[i], index, axis=1)
        ROIs = np.delete(ROIs, index, axis=0)

    # order FC matrices by ROIs (ROIs is the value left from the last pass above)
    order = np.argsort(ROIs)
    for i in range(len(FC_list)):
        FC_list[i] = FC_list[i][order, :, :]
        FC_list[i] = FC_list[i][:, order, :]

    # undo the Fisher z-transform
    for i in range(len(FC_list)):
        FC_list[i] = np.tanh(FC_list[i])

    # initialize correlation arrays
    Corrrelation_list = [None] * len(FC_list)
    for i in range(len(FC_list)):
        Corrrelation_list[i] = np.zeros([FC_list[0].shape[2]], dtype=float)

    correlation_type = 'spearman'
    # calculate Structure-Function Correlation.
    for i in range(len(FC_list)):
        # NOTE(review): range(shape[2] - 1) never visits the last index along
        # axis 2, so the final entry of each correlation array keeps its
        # initial 0 - confirm this is intended.
        for t in range(FC_list[i].shape[2] - 1):
            # Spearman Rank Correlation: functional connectivity and structural
            # connectivity are non-normally distributed. So we should use spearman
            if correlation_type == 'spearman':
                Corrrelation_list[i][t] = spearmanr(
                    np.ndarray.flatten(FC_list[i][:, :, t]),
                    np.ndarray.flatten(
                        structural_connectivity_array)).correlation
            # Pearson Correlation: calculated because past studies use it and
            # we want to see if these results are comparable.
            if correlation_type == 'pearson':
                Corrrelation_list[i][t] = pearsonr(
                    np.ndarray.flatten(FC_list[i][:, :, t]),
                    np.ndarray.flatten(structural_connectivity_array))[0]

    return (Corrrelation_list)
def plot_corr_heatmap(df,
                      color_threshold=0.6,
                      cmap=None,
                      figsize=None,
                      value_fontsize=8,
                      label_fontsize=9,
                      precision=2,
                      xrot=80):
    """
    Draw the Spearman correlation matrix of the columns of df as a heatmap.

    Only cells above the diagonal are annotated with their (signed)
    correlation. A cell gets a background color when the absolute value
    exceeds color_threshold; everything else, including the diagonal and
    the lower triangle, is drawn white.

    Spearman's correlation equals Pearson's correlation computed on the
    rank-transformed variables. It is nonparametric and detects monotonic
    rather than strictly linear relationships.

    If cmap is None, a colormap is built from the 'coolwarm' map sampled
    from color_threshold up to .85; a string is resolved with
    plt.get_cmap. The colormap is copied before its "under" color is set.

    SAMPLE CODE

    from rfpimp import plot_corr_heatmap
    viz = plot_corr_heatmap(df_train, figsize=(7,5),
                            label_fontsize=13, value_fontsize=11)
    viz.view() # or just viz in notebook
    """
    corr = spearmanr(df).correlation
    if len(corr.shape) == 0:
        # A scalar comes back for two columns; expand it to a 2x2 matrix.
        corr = np.array([[1.0, corr], [corr, 1.0]])

    # Color by magnitude; the signed values are still what gets printed.
    filtered = np.abs(copy(corr))
    # Push the diagonal and lower triangle below vmin so they render white.
    filtered[np.tril_indices_from(filtered)] = -9999

    if cmap is None:
        coolwarm = plt.get_cmap('coolwarm')
        cmap = ListedColormap(
            [coolwarm(level) for level in np.arange(color_threshold, .85, 0.01)])
    elif isinstance(cmap, str):
        cmap = plt.get_cmap(cmap)
    cm = copy(cmap)
    cm.set_under(color='white')

    if figsize:
        plt.figure(figsize=figsize)
    im = plt.imshow(filtered,
                    cmap=cm,
                    vmin=color_threshold,
                    vmax=1,
                    aspect='equal')

    n_rows, n_cols = filtered.shape
    cell_text_style = dict(horizontalalignment='center',
                           verticalalignment='center',
                           fontsize=value_fontsize,
                           color=GREY)
    for row in range(n_rows):
        for col in range(n_cols):
            if row == col:
                # Mark the diagonal instead of printing the trivial 1.0.
                plt.annotate('x', xy=(col, row), **cell_text_style)
            elif row < col:
                plt.annotate(myround(corr[row, col], precision),
                             xy=(col, row),
                             **cell_text_style)

    tick_positions = [
        color_threshold,
        color_threshold + (1 - color_threshold) / 2, 1.0
    ]
    cb = plt.colorbar(im, fraction=0.046, pad=0.04, ticks=tick_positions)
    cb.ax.tick_params(
        labelsize=label_fontsize,
        labelcolor=GREY,
    )
    cb.outline.set_edgecolor('white')

    plt.xticks(range(n_rows),
               df.columns,
               rotation=xrot,
               horizontalalignment='right',
               fontsize=label_fontsize,
               color=GREY)
    plt.yticks(range(n_rows),
               df.columns,
               verticalalignment='center',
               fontsize=label_fontsize,
               color=GREY)

    ax = plt.gca()
    for side in ('top', 'right', 'left', 'bottom'):
        ax.spines[side].set_linewidth(.3)

    plt.tight_layout()
    return PimpViz()
Example #12
0
def predictedToActualSimilarity(predictions, actual, labels):
    """
    A cross-validated predicted-to-actual similarity analysis; serves to quantify how much information has transferred
    Similar to a cross-validated RSA

    For every cross-validation fold (one row of `labels`), each held-out predicted
    pattern is rank-correlated (Spearman, Fisher z-transformed) with the
    prototype of every condition, where a prototype is the mean of the *actual*
    patterns of that condition over all remaining rows.

    Parameters:
        predictions - predicted activation patterns. A sample X features matrix.
        actual - actual activation patterns to compare predictions against. A sample X features matrix.
        labels - a (folds) x (conditions) matrix of row indices into `predictions`/`actual`
                 (32 x 4 in the original description); columns indicate task condition
                 (or task-rule); rows specify the miniblock index. Values are cast to
                 int, so a float-typed index matrix is accepted.

    Returns
        ite_mean - The average information transfer estimate for a given prediction, averaged across all miniblocks:
                   mean matched-condition similarity minus mean mismatched-condition similarity.
                   With a single fold or a single condition one of the means is taken over an
                   empty set and the result is NaN.
    """
    # Cast once up front: both the held-out and the training indices are used
    # for fancy indexing, which rejects float indices.
    labels = np.asarray(labels).astype(int)
    ncvs, nrules = labels.shape  # number of folds, number of task conditions

    correct_matches = []
    incorrect_matches = []
    # Running cross-validation. Hold out one sample of each condition in each fold
    for cv in range(ncvs):
        test_miniblocks = labels[cv, :]
        # Delete the test set row from the train set
        train_miniblocks = np.delete(labels, cv, axis=0)

        # The prototype of a condition depends only on the fold, not on which
        # held-out prediction it is compared with, so build each one once.
        prototypes = [
            np.mean(actual[train_miniblocks[:, cond], :], axis=0)
            for cond in range(nrules)
        ]

        matched = []
        mismatched = []
        for cond1 in range(nrules):
            predicted_miniblock = predictions[test_miniblocks[cond1], :]

            # predicted-to-actual similarity against every condition prototype
            for cond2, prototype in enumerate(prototypes):
                similarity = np.arctanh(
                    stats.spearmanr(predicted_miniblock, prototype)[0])
                if cond1 == cond2:
                    matched.append(similarity)
                else:
                    mismatched.append(similarity)

        # Get average matches / mismatches for this cross-validation fold
        correct_matches.append(np.mean(matched))
        incorrect_matches.append(np.mean(mismatched))

    ite_mean = np.mean(correct_matches) - np.mean(incorrect_matches)
    return ite_mean
Example #13
0
# =============================================================================
# transform
# =============================================================================
# Forward-fill gaps in each price table, then add a 'log_return' column:
# log of the 'Adj Close' divided by the previous row's 'Adj Close'
# (the first row has no predecessor, so its value is NaN).
gsem = gsem.fillna(method='ffill')
gsem['log_return'] = np.log(gsem['Adj Close'] / gsem['Adj Close'].shift(1))
gscef = gscef.fillna(method='ffill')
gscef['log_return'] = np.log(gscef['Adj Close'] / gscef['Adj Close'].shift(1))

# =============================================================================
# OLS Regression
# =============================================================================
#df = pd.DataFrame({'A': gsem['log_return'], 'B': gscef['log_return']})
#result = sm.ols(formula = 'A ~ B', data = df).fit()
#print(result.summary())
# correlation
# NOTE(review): the Spearman result below is computed on price levels and then
# discarded (neither assigned nor printed); only the Pearson matrix is shown.
stats.spearmanr(gsem['Adj Close'], gscef['Adj Close'])
print(np.corrcoef(gsem['Adj Close'], gscef['Adj Close']))

# =============================================================================
## plotting
## =============================================================================
#plt.plot(gsem['log_return'], gscef['log_return'], 'r.')
#ax = plt.axis() # grab x-axis values
#x = np.linspace(ax[0], ax[1] + 0.01)
#plt.plot(x, -0.0003 + x * 0.9303, 'b')
#plt.grid(True)
#plt.xlabel('Goldman Sachs Emerging Market Index')
#plt.ylabel('Goldman Sachs China Equity Fund')
#plt.title('Scatter of log returns and regresson line')

#upper = plt.subplot(2, 1, 1)
Example #14
0
def compute_spearman(predicts, labels):
    """Return the Spearman rank correlation of labels vs. predictions, times 100.

    Yields NaN when fewer than two predictions are supplied, since no rank
    correlation is computed in that case.
    """
    if len(predicts) < 2:
        return np.nan
    rho = spearmanr(labels, predicts)[0]
    return 100.0 * rho
Example #15
0
    if m_match_flag[i] > -1:
        m_match_bin_count[m_mass_bin_index[i], m_z_bin_index[i]] += 1               
               
# Per-bin ratio of matched counts to total counts, flipped along the first axis.
complete_bin_mass = np.flipud(np.array(m_match_bin_count).astype("float") / np.array(m_bin_count2).astype("float"))

################
# SPEARMAN RHO #
################

# For each redshift bin, rank-correlate mock.rich with cluster.rich over the
# matched pairs. The "_1" selections pick pairs via m_z_bin_index and the "_2"
# selections via c_z_bin_index of the matched object
# (presumably mock vs. cluster redshift binning -- confirm against the caller).
for i in range(n_z_bins):
    m_index_1 = mock.rich[match_index][m_z_bin_index[match_index] == i]
    m_index_2 = mock.rich[match_index][c_z_bin_index[m_match_flag[match_index]] == i]
    c_index_1 = cluster.rich[m_match_flag[match_index]][m_z_bin_index[match_index] == i]
    c_index_2 = cluster.rich[m_match_flag[match_index]][c_z_bin_index[m_match_flag[match_index]] == i]
    # A correlation needs at least two pairs; otherwise rho and its error are set to 0.
    if m_index_1.size > 1:
        m_rho, p1 = ss.spearmanr(m_index_1, c_index_1)
        # Error estimate taken as 0.6325 / sqrt(n - 1).
        m_rho_err = 0.6325 / (len(m_index_1) - 1) ** 0.5
    else:
        m_rho = 0.0
        m_rho_err = 0.0
    if m_index_2.size > 1:
        c_rho, p2 = ss.spearmanr(m_index_2, c_index_2)
        c_rho_err = 0.6325 / (len(m_index_2) - 1) ** 0.5
    else:
        c_rho = 0.0
        c_rho_err = 0.0
    # First bin: start the result arrays from this bin's values.
    if i == 0:
        m_rhos = np.array(m_rho)
        m_rhos_err = np.array(m_rho_err)
        c_rhos = np.array(c_rho)
        c_rhos_err = np.array(c_rho_err)
Example #16
0
from statsmodels.stats import multitest
import funs_fi as fi

# Output directory path under the current working directory.
dir_base = os.getcwd()
dir_output = os.path.join(dir_base, 'output')

################################################
# ----------- (1) LOAD IN THE DATA ----------- #

# Read the three processed tables (paths are relative to the working directory)
# and join them on the ('tt', 'ID') key pair; the temporaries are dropped afterwards.
tmp_FI = pd.read_csv(os.path.join('processed', 'df_FI.csv'))
tmp_inf = pd.read_csv(os.path.join('processed', 'df_inf.csv'))
tmp_res = pd.read_csv(os.path.join('processed', 'df_res.csv'))
df = tmp_inf.merge(tmp_FI, on=['tt', 'ID']).merge(tmp_res, on=['tt', 'ID'])
del tmp_FI, tmp_inf, tmp_res

##################################################
# ----------- (2) REVERSE FI RESULTS ----------- #

# Keep only rows with tt == 'neg' and add FQ = FI / (num1 + num2).
df_neg = df[df.tt == 'neg'].reset_index(drop=True)
df_neg['FQ'] = df_neg.FI / (df_neg.num1 + df_neg.num2)

print(np.round(df_neg.FI.describe(), 1))
print(np.round(df_neg.FQ.describe(), 2))

# Spearman rank correlation between the FI and IF columns (statistic, p-value).
rho_IF = stats.spearmanr(df_neg.FI, df_neg.IF)
print('Rho: %0.3f (p-val: %0.3f) for IF-FI' % (rho_IF[0], rho_IF[1]))

# Count rows above / at-or-below the FI threshold.
thresh = 12
print('A total of %i studies have a RFI>%i and %i have <=%i' %
      (sum(df_neg.FI > thresh), thresh, sum(df_neg.FI <= thresh), thresh))
Example #17
0
# Load the mtcars table and give its twelve columns explicit names.
cars_data = pd.read_csv("mtcars.csv")
cars_data.columns = ['car_names', 'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs' , 'am','gear' ,'carb']

# NOTE(review): the return value is not used, so this shows nothing when run as a script.
cars_data.head()

# Feature matrix: columns at positions 2 and 4, i.e. 'cyl' and 'hp' in the list above.
cars_sub = cars_data.iloc[:, [2, 4]].values
cars_data_names = ['cyl', 'hp']

# Target: column at position 9, i.e. 'am' in the list above.
y = cars_data.iloc[:, 9].values

sb.regplot(x='cyl', y='hp', data=cars_data, scatter=True)

cyl = cars_data['cyl']
hp = cars_data['hp']

# Rank correlation between the two predictors.
spearmanr_coefficient, p_value = spearmanr(cyl, hp)

sb.countplot(x='am', data=cars_data, palette="hls")


# Scale the features, then fit and score the classifier on the same data
# (the printed score is therefore a training-set score).
X = scale(cars_sub)
LogReg = LogisticRegression()

LogReg.fit(X, y)

print(LogReg.score(X, y))

'''Predictors'''
# Predictions on the training data, summarised per class.
y_pred = LogReg.predict(X)

print(classification_report(y, y_pred))
Example #18
0
    def eval(self, splt):
        """
        Run the embedder and projection head over the `splt` split and score
        the predictions against the gold labels stored in `self.data[splt]['y']`.

        Sentence pairs are merged with `concat_batches`, using the 'en'
        language id for both sentences. Metric keys always carry the `valid`
        tag, whatever `splt` is.

        Returns an OrderedDict with the current epoch plus, scaled by 100,
        accuracy / F1 / Matthews correlation for classification tasks or
        Pearson / Spearman correlation otherwise. The scores are also logged
        as a JSON line.
        """
        params = self.params
        self.embedder.eval()
        self.proj.eval()

        scores = OrderedDict({'epoch': self.epoch})
        task = self.task.lower()
        en_id = params.lang2id['en']

        batch_preds = []  # per-batch predicted values
        batch_golds = []  # per-batch reference values

        for batch in self.get_iterator(splt):

            # unpack the batch; sentence pairs are concatenated into one input
            if self.n_sent != 1:
                (sent1, len1), (sent2, len2), idx = batch
                x, lengths, _, _ = concat_batches(
                    sent1, len1, en_id,
                    sent2, len2, en_id,
                    params.pad_index,
                    params.eos_index,
                    reset_positions=False)
            else:
                (x, lengths), idx = batch
            y = self.data[splt]['y'][idx]

            # cuda
            x, y, lengths = to_cuda(x, y, lengths)

            # forward pass
            embeddings = self.embedder.get_embeddings(x,
                                                      lengths,
                                                      positions=None,
                                                      langs=None)
            output = self.proj(embeddings)
            if self.is_classif:
                # index of the highest-scoring class
                p = output.data.max(1)[1]
            else:
                p = output.squeeze(1)
            batch_preds.append(p.cpu().numpy())
            batch_golds.append(y.cpu().numpy())

        gold = np.concatenate(batch_golds)
        pred = np.concatenate(batch_preds)

        if not self.is_classif:
            scores['%s_valid_prs' % task] = 100. * pearsonr(pred, gold)[0]
            scores['%s_valid_spr' % task] = 100. * spearmanr(pred, gold)[0]
        else:
            n_correct = (pred == gold).sum()
            scores['%s_valid_acc' % task] = 100. * n_correct / len(pred)
            averaging = 'binary' if params.out_features == 2 else 'micro'
            scores['%s_valid_f1' % task] = 100. * f1_score(
                gold, pred, average=averaging)
            scores['%s_valid_mc' % task] = 100. * matthews_corrcoef(gold, pred)

        logger.info("__log__:%s" % json.dumps(scores))
        return scores
Example #19
0
    posthoc_tests['posthoc_' + str(var)] = posthoc
    stats_tests.loc[i, 'variable'] = var
    stats_tests.loc[i, 'test_type'] = test_type
    stats_tests.loc[i, 'p_value'] = test[1]
    stats_tests.loc[i, 'p_value_variance'] = p_var

# Correct for multiple tests
stats_tests['p_value'] = multipletests(stats_tests['p_value'],
                                       method='fdr_bh')[1]
stats_tests['p_value_variance'] = multipletests(
    stats_tests['p_value_variance'], method='fdr_bh')[1]

# Choose the correlation test from the normality of both variables: Spearman
# if either one departs from normality (p < 0.05), Pearson otherwise.
# A single if/else guarantees that exactly one branch runs, so the correlation
# results are always defined (two separate strict comparisons left a gap at
# p == 0.05), and each normality test is evaluated only once.
p_normal_trials = stats.normaltest(learned['n_trials'])[1]
p_normal_rt = stats.normaltest(learned['reaction_time'])[1]
if p_normal_trials < 0.05 or p_normal_rt < 0.05:
    test_type = 'spearman'
    correlation_coef, correlation_p = stats.spearmanr(learned['reaction_time'],
                                                      learned['n_trials'])
else:
    test_type = 'pearson'
    correlation_coef, correlation_p = stats.pearsonr(learned['reaction_time'],
                                                     learned['n_trials'])

# Add all mice to dataframe separately for plotting
learned_no_all = learned.copy()
learned_no_all.loc[learned_no_all.shape[0] + 1, 'lab_number'] = 'All'
learned_2 = learned.copy()
learned_2['lab_number'] = 'All'
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the original
# rows and the relabelled 'All' copy in the same order.
import pandas as pd
learned_2 = pd.concat([learned, learned_2])

# %%
seaborn_style()
Example #20
0
    im = np.asarray(cv2.imread(directory))
    for j in range(Num_Patch):
        x = im.shape[0]
        y = im.shape[1]
        x_p = np.random.randint(x - 128, size=1)[0]
        y_p = np.random.randint(y - 128, size=1)[0]

        temp = im[x_p:x_p + 128, y_p:y_p + 128, :].transpose([2, 0, 1])

        out = net.forward_all(data=np.asarray([temp]))

        feat[i, j] = out[ft][0]
        pre[i] += out[ft][0]
    pre[i] /= Num_Patch
    med[i] = np.median(feat[i, :])

# Correlate the per-image mean prediction (`pre`) with the reference scores.
srocc = stats.spearmanr(pre, scores)[0]
lcc = stats.pearsonr(pre, scores)[0]
# Single-argument print(...) produces the same output on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print('%   LCC of mean : {}'.format(lcc))
print('% SROCC of mean: {}'.format(srocc))

# Only the mean-based correlations are written to the result files.
srocc_file.write('%6.3f\n' % (srocc))
lcc_file.write('%6.3f\n' % (lcc))
srocc_file.close()
lcc_file.close()

# Same correlations using the per-image median (`med`); printed only.
srocc = stats.spearmanr(med, scores)[0]
lcc = stats.pearsonr(med, scores)[0]
print('%   LCC of median: {}'.format(lcc))
print('% SROCC of median: {}'.format(srocc))
Example #21
0
# Drop any previous model reference and load the saved model from disk.
model = None
best_model = load_model('BLSTM_pretrained_FE.hdf5')

# Per-utterance Spearman results for channel 1 (intensity) and channel 0 (pitch).
Recon_Spear_Intensity = []
Recon_Spear_Pitch = []
for i in range(len(Test_clean_audio)):
    # Spec Feature
    clean_Spec, _ = Sp_and_Phase(Test_clean_audio[i], Noisy=False)
    # Praat Feature
    clean_Prosodic = Prosodic_feat_process(Test_clean_prosodic[i],
                                           Normalize=False)
    clean_Prosodic = Exten_prosodic_feat(clean_Spec, clean_Prosodic)
    # model prediction
    pred_Prosodic = best_model.predict(clean_Spec)
    # Spearman corr. evaluation metric
    # Each channel is flattened to 1-D before correlating. The whole result
    # returned by spearmanr is stored, not just the coefficient.
    p_spear_corr = spearmanr(clean_Prosodic[:, :, 0].reshape(-1),
                             pred_Prosodic[:, :, 0].reshape(-1))
    Recon_Spear_Pitch.append(p_spear_corr)
    e_spear_corr = spearmanr(clean_Prosodic[:, :, 1].reshape(-1),
                             pred_Prosodic[:, :, 1].reshape(-1))
    Recon_Spear_Intensity.append(e_spear_corr)

    # Plot Reconstruction Results
    # Only channel 0 is drawn: reference in blue, prediction in red.
    # plt.show() is called once per utterance.
    plt.rc('font', family='Times New Roman')
    plt.plot(clean_Prosodic[:, :, 0].reshape(-1), color='blue', linewidth=3.5)
    plt.plot(pred_Prosodic[:, :, 0].reshape(-1), color='red', linewidth=3.5)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.show()

# Averaging over axis 0 and taking element 0 selects the first field of the
# stored results (presumably the correlation coefficient -- confirm which
# spearmanr implementation is imported).
print('Avg. Reconstruct Intensity SpearCorr: ' +
      str(np.mean(Recon_Spear_Intensity, axis=0)[0]))