Ejemplo n.º 1
0
def run_QTL_analysis(pheno_filename,
                     anno_filename,
                     geno_prefix,
                     plinkGenotype,
                     output_dir,
                     window_size=250000,
                     min_maf=0.05,
                     min_hwe_P=0.001,
                     min_call_rate=0.95,
                     blocksize=1000,
                     cis_mode=True,
                     skipAutosomeFiltering=False,
                     gaussianize_method=None,
                     minimum_test_samples=10,
                     seed=np.random.randint(40000),
                     n_perm=0,
                     write_permutations=False,
                     relatedness_score=0.95,
                     feature_variant_covariate_filename=None,
                     snps_filename=None,
                     feature_filename=None,
                     snp_feature_filename=None,
                     genetic_range='all',
                     covariates_filename=None,
                     kinship_filename=None,
                     sample_mapping_filename=None,
                     extended_anno_filename=None,
                     regressCovariatesUpfront=False):
    fill_NaN = Imputer(missing_values=np.nan,
                       strategy='mean',
                       axis=0,
                       copy=False)
    print('Running QTL analysis.')
    lik = 'normal'
    minimumProbabilityStep = 0.1
    '''Core function to take input and run QTL tests on a given chromosome.'''
    if relatedness_score is not None:
        relatedness_score = float(relatedness_score)
    [phenotype_df, kinship_df, covariate_df, sample2individual_df,complete_annotation_df, annotation_df, snp_filter_df, snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, bim, fam, bed, bgen, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]=\
    utils.run_QTL_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(pheno_filename=pheno_filename, anno_filename=anno_filename, geno_prefix=geno_prefix, plinkGenotype=plinkGenotype, cis_mode=cis_mode, skipAutosomeFiltering = skipAutosomeFiltering,
                      minimum_test_samples= minimum_test_samples,  relatedness_score=relatedness_score, snps_filename=snps_filename, feature_filename=feature_filename, snp_feature_filename=snp_feature_filename, selection=genetic_range,
                     covariates_filename=covariates_filename, kinship_filename=kinship_filename, sample_mapping_filename=sample_mapping_filename, extended_anno_filename=extended_anno_filename, feature_variant_covariate_filename=feature_variant_covariate_filename)

    mixed = kinship_df is not None
    if (kinship_df is None) or (relatedness_score is None):
        geneticaly_unique_individuals = sample2individual_df['iid'].values
    QS = None
    if (feature_list == None or len(feature_list) == 0):
        print('No features to be tested.')
        sys.exit()

    #Open output files
    qtl_loader_utils.ensure_dir(output_dir)
    if not selectionStart is None:
        output_writer = qtl_output.hdf5_writer(
            output_dir + '/qtl_results_{}_{}_{}.h5'.format(
                chromosome, selectionStart, selectionEnd))
    else:
        output_writer = qtl_output.hdf5_writer(
            output_dir + '/qtl_results_{}.h5'.format(chromosome))
    if (write_permutations):
        if not selectionStart is None:
            permutation_writer = qtl_output.hdf5_permutations_writer(
                output_dir + '/perm_results_{}_{}_{}.h5'.format(
                    chromosome, selectionStart, selectionEnd), n_perm)
        else:
            permutation_writer = qtl_output.hdf5_permutations_writer(
                output_dir + '/perm_results_{}.h5'.format(chromosome), n_perm)

    #Arrays to store indices of snps tested and pass and fail QC SNPs for features without missingness.
    tested_snp_ids = []
    pass_qc_snps_all = []
    fail_qc_snps_all = []
    fail_qc_features = []
    alpha_params = []
    beta_params = []
    n_samples = []
    n_e_samples = []
    na_containing_features = 0
    currentFeatureNumber = 0
    snpQcInfoMain = None

    for feature_id in feature_list:
        snpQcInfo = None
        currentFeatureNumber += 1
        if (len(phenotype_df.loc[feature_id, :])) < minimum_test_samples:
            print("Feature: " + feature_id +
                  " not tested not enough samples do QTL test.")
            fail_qc_features.append(feature_id)
            geneticaly_unique_individuals = tmp_unique_individuals
            continue
        data_written = False
        contains_missing_samples = False
        snpQuery = utils.do_snp_selection(feature_id, complete_annotation_df,
                                          bim, cis_mode, window_size,
                                          skipAutosomeFiltering)
        snp_cov_df = None
        if (feature_variant_covariate_df is not None):
            if (feature_id in feature_variant_covariate_df['feature'].values):
                covariateSnp = feature_variant_covariate_df['snp_id'].values[
                    feature_variant_covariate_df['feature'] == feature_id]
                if (any(i in bim['snp'].values for i in covariateSnp)):
                    snpQuery_cov = bim.loc[
                        bim['snp'].map(lambda x: x in list(covariateSnp)), :]
                    if (plinkGenotype):
                        snp_cov_df = pd.DataFrame(
                            data=bed[snpQuery_cov['i'].values, :].compute().
                            transpose(),
                            index=fam.index,
                            columns=snpQuery_cov['snp'],
                        )
                    else:
                        ##Here we make some assumptions on the SNPs. They are expected to be ploidy 2!
                        ##Also we don't use a minimal quality to assure a value is present for all samples.
                        print(
                            'Warning, during the regression of SNPs we assume ploidy 2.'
                        )
                        snp_cov_df_t = pd.DataFrame(columns=fam.index)
                        rowNumber = 0
                        for snpId in snpQuery_cov['i']:
                            geno = bgen["genotype"][snpId].compute()
                            if (geno["phased"]):
                                snp_df_dosage_t = geno["probs"][:, [0, 2]].sum(
                                    1).astype(float)
                                snp_df_dosage_t[(
                                    np.amax(geno["probs"][:, :2], 1) +
                                    np.amax(geno["probs"][:, 2:4], 1)) < (
                                        1 +
                                        minimumProbabilityStep)] = float('NaN')
                            else:
                                snp_df_dosage_t = (geno["probs"][:, 0] *
                                                   2) + geno["probs"][:, 1]
                                snp_df_dosage_t[
                                    np.amax(geno["probs"][:, :3], 1) < (
                                        (1 / 3) +
                                        minimumProbabilityStep)] = float('NaN')
                            snp_df_dosage_t = pd.Series(snp_df_dosage_t,
                                                        index=fam.index)
                            snp_df_dosage_t.name = snpId
                            snp_cov_df_t = snp_cov_df_t.append(snp_df_dosage_t)
                            rowNumber = rowNumber + 1
                        snp_cov_df_t = snp_cov_df_t.transpose()

        if (len(snpQuery) != 0) and (snp_filter_df is not None):
            toSelect = set(snp_filter_df.index).intersection(
                set(snpQuery['snp']))
            snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)]

        if (len(snpQuery) != 0) and (snp_feature_filter_df is not None):
            toSelect = set(
                np.unique(snp_feature_filter_df['snp_id'].loc[
                    snp_feature_filter_df['feature'] ==
                    feature_id])).intersection(set(snpQuery['snp']))
            snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)]

        if len(snpQuery) == 0:
            print("Feature: " + feature_id +
                  " not tested. No SNPS passed QC for phenotype.")
            fail_qc_features.append(feature_id)
            continue
        else:
            phenotype_ds = phenotype_df.loc[feature_id]
            contains_missing_samples = any(~np.isfinite(phenotype_ds))
            if (contains_missing_samples):
                print('Feature: ' + feature_id + ' contains missing data.')
                phenotype_ds.dropna(inplace=True)
                na_containing_features = na_containing_features + 1
            '''select indices for relevant individuals in genotype matrix
            These are not unique. NOT to be used to access phenotype/covariates data
            '''

            individual_ids = sample2individual_df.loc[phenotype_ds.index,
                                                      'iid'].values
            sample2individual_feature = sample2individual_df.loc[
                phenotype_ds.index]

            if (contains_missing_samples):
                tmp_unique_individuals = geneticaly_unique_individuals
                if (kinship_df is not None) and (relatedness_score
                                                 is not None):
                    geneticaly_unique_individuals = utils.get_unique_genetic_samples(
                        kinship_df.loc[individual_ids, individual_ids],
                        relatedness_score)
                else:
                    geneticaly_unique_individuals = individual_ids
            else:
                #If no missing samples we can use the previous SNP Qc information before actually loading data.
                #This allows for more efficient blocking and retrieving of data
                snpQuery = snpQuery.loc[snpQuery['snp'].map(
                    lambda x: x not in list(map(str, fail_qc_snps_all)))]

            if phenotype_ds.empty or len(
                    geneticaly_unique_individuals) < minimum_test_samples:
                print("Feature: " + feature_id +
                      " not tested not enough samples do QTL test.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue
            elif np.var(phenotype_ds.values) == 0:
                print("Feature: " + feature_id +
                      " has no variance in selected individuals.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue

            print('For feature: ' + str(currentFeatureNumber) + '/' +
                  str(len(feature_list)) + ' (' + feature_id + '): ' +
                  str(snpQuery.shape[0]) +
                  ' SNPs need to be tested.\n Please stand by.')

            if (n_perm != 0):
                bestPermutationPval = np.ones((n_perm), dtype=np.float)

            #Here we need to start preparing the LMM, can use the fam for sample IDS in SNP matrix.
            #test if the covariates, kinship, snp and phenotype are in the same order
            if ((all(kinship_df.loc[individual_ids,individual_ids].index==sample2individual_feature.loc[phenotype_ds.index]['iid']) if kinship_df is not None else True) &\
                 (all(phenotype_ds.index==covariate_df.loc[sample2individual_feature['sample'],:].index)if covariate_df is not None else True)):
                '''
                if all lines are in order put in arrays the correct genotype and phenotype
                x=a if cond1 else b <---> equivalent to if cond1: x=a else x=b;                 better readability of the code
                 '''
                if kinship_df is not None:
                    kinship_mat = kinship_df.loc[individual_ids,
                                                 individual_ids].values
                    kinship_mat = kinship_mat.astype(float)
                    ##GOWER normalization of Kinship matrix.
                    kinship_mat *= (kinship_mat.shape[0] - 1) / (
                        kinship_mat.trace() - kinship_mat.mean(0).sum())
                    ## This needs to go with the subselection stuff.
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(kinship_mat)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(kinship_mat)
                if kinship_df is None:
                    K = np.eye(len(phenotype_ds.index))
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(K)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(K)
                cov_matrix = covariate_df.loc[sample2individual_feature[
                    'sample'], :].values if covariate_df is not None else None
                if covariate_df is None:
                    cov_matrix = np.ones((len(individual_ids), 1))
                if snp_cov_df is not None:
                    snp_cov_df_tmp = snp_cov_df.loc[individual_ids, :]
                    snp_cov_df_tmp.index = sample2individual_feature['sample']
                    snp_cov_df = pd.DataFrame(
                        fill_NaN.fit_transform(snp_cov_df_tmp))
                    snp_cov_df.index = snp_cov_df_tmp.index
                    snp_cov_df.columns = snp_cov_df_tmp.columns
                    cov_matrix = np.concatenate(
                        (cov_matrix, snp_cov_df.values), 1)
                    snp_cov_df_tmp = None
                    snp_cov_df = None
                cov_matrix = cov_matrix.astype(float)
            else:
                print(
                    'There is an issue in mapping phenotypes vs covariates and/or kinship'
                )
                sys.exit()

            phenotype = utils.force_normal_distribution(
                phenotype_ds.values, method=gaussianize_method
            ) if gaussianize_method is not None else phenotype_ds.values

            #Prepare LMM
            phenotype = phenotype.astype(float)

            ##Mixed and test.
            ##This is a future change so we don't need to decompose the COVs every time.
            ##Like QS this needs to happen when genetic unique individuals is the same.
            #svd_cov = economic_svd(cov_matrix)
            #lmm = LMM(phenotype, cov_matrix, QS, SVD=svd_cov)
            #These steps need to happen only once per phenotype.
            #print(QS)
            lmm = LMM(phenotype, cov_matrix, QS)
            if not mixed:
                lmm.delta = 1
                lmm.fix('delta')
            #Prepare null model.
            lmm.fit(verbose=False)
            if regressCovariatesUpfront:
                phenotype_corrected = phenotype - cov_matrix[:, 1:].dot(
                    lmm.beta[1:])
                cov_matrix_corrected = cov_matrix[:, 0]
                lmm = LMM(phenotype_corrected, cov_matrix_corrected, QS)
                lmm.fit(verbose=False)

            null_lml = lmm.lml()
            flmm = lmm.get_fast_scanner()
            countChunker = 0
            for snpGroup in utils.chunker(snpQuery, blocksize):
                countChunker = countChunker + 1
                #print(countChunker)
                #Fix seed at the start of the first chunker so all permutations are based on the same random first split.
                np.random.seed(seed)
                #print(snpGroup)
                snp_idxs = snpGroup['i'].values
                snp_names = snpGroup['snp'].values

                tested_snp_ids.extend(snp_names)
                #subset genotype matrix, we cannot subselect at the same time, do in two steps.
                if (plinkGenotype):
                    snp_df = pd.DataFrame(
                        data=bed[snp_idxs, :].compute().transpose(),
                        index=fam.index,
                        columns=snp_names)
                else:
                    snp_df_dosage = pd.DataFrame(np.nan,
                                                 index=fam.index,
                                                 columns=snp_names)
                    snp_df = pd.DataFrame(np.nan,
                                          index=fam.index,
                                          columns=snp_names)
                    rowNumber = 0
                    for snpId in snp_idxs:
                        geno = bgen["genotype"][snpId].compute()
                        if (geno["ploidy"].min() > 1 & geno["ploidy"].max() <
                                3):
                            if (geno["phased"]):
                                snp_df_dosage_t = geno["probs"][:, [0, 2]].sum(
                                    1).astype(float)
                                snp_df_t = (np.abs(
                                    np.argmax(geno["probs"][:, :2], axis=1) - 1
                                ) + np.abs(
                                    np.argmax(geno["probs"][:, 2:4], axis=1) -
                                    1)).astype(float)
                                naId = (np.amax(geno["probs"][:, :2], 1) +
                                        np.amax(geno["probs"][:, 2:4], 1)) < (
                                            1 + minimumProbabilityStep)
                                snp_df_dosage_t[naId] = float('NaN')
                                snp_df_t[naId] = float('NaN')
                            else:
                                snp_df_dosage_t = (
                                    (geno["probs"][:, 0] * 2) +
                                    geno["probs"][:, 1]).astype(float)
                                snp_df_t = (np.abs(
                                    np.argmax(geno["probs"][:, :3], axis=1) -
                                    2)).astype(float)
                                naId = np.amax(geno["probs"][:, :3], 1) < (
                                    (1 / 3) + minimumProbabilityStep)
                                snp_df_dosage_t[naId] = float('NaN')
                                snp_df_t[naId] = float('NaN')
                            snp_df_dosage.loc[:, snp_names[
                                rowNumber]] = snp_df_dosage_t
                            snp_df.loc[:, snp_names[rowNumber]] = snp_df_t
                        rowNumber = rowNumber + 1
                    snp_df_dosage = snp_df_dosage.loc[individual_ids, :]

                snp_df = snp_df.loc[individual_ids, :]

                snp_df = snp_df.loc[:,
                                    np.unique(snp_df.columns)[
                                        np.unique(snp_df.columns,
                                                  return_counts=1)[1] == 1]]
                #SNP QC.
                if not contains_missing_samples:
                    #remove SNPs from snp_df if they have previously failed QC
                    snp_df = snp_df.loc[:,
                                        snp_df.columns[~snp_df.columns.
                                                       isin(fail_qc_snps_all)]]
                    if snp_df.shape[1] == 0:
                        continue
                    snps_to_test_df = snp_df.loc[:, snp_df.columns[
                        ~snp_df.columns.isin(pass_qc_snps_all)]]
                    if snps_to_test_df.shape[1] > 0:
                        #Only do QC on relevant SNPs. join pre-QCed list and new QCed list.
                        if kinship_df is not None:
                            passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(
                                snps_to_test_df.iloc[np.unique(
                                    snps_to_test_df.index,
                                    return_index=1)[1]].loc[
                                        geneticaly_unique_individuals, :],
                                min_call_rate, min_maf, min_hwe_P)
                        else:
                            passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(
                                snps_to_test_df, min_call_rate, min_maf,
                                min_hwe_P)
                        snps_to_test_df = None
                        #append snp_names and failed_snp_names
                        pass_qc_snps_all.extend(passed_snp_names)
                        fail_qc_snps_all.extend(failed_snp_names)
                    snp_df = snp_df.loc[:,
                                        snp_df.columns[snp_df.columns.
                                                       isin(pass_qc_snps_all)]]
                else:
                    #Do snp QC for relevant section.
                    #Get relevant slice from: phenotype_ds
                    if kinship_df is not None:
                        passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(
                            snp_df.iloc[np.unique(
                                snp_df.index, return_index=1)[1]].loc[
                                    geneticaly_unique_individuals, :],
                            min_call_rate, min_maf, min_hwe_P)
                    else:
                        passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(
                            snp_df, min_call_rate, min_maf, min_hwe_P)
                    snp_df = snp_df.loc[:,
                                        snp_df.columns[snp_df.columns.
                                                       isin(passed_snp_names)]]
                snpQcInfo_t = None
                if call_rate is not None:
                    snpQcInfo_t = call_rate
                    if maf is not None:
                        snpQcInfo_t = pd.concat(
                            [snpQcInfo_t,
                             maf.reindex(snpQcInfo_t.index)],
                            axis=1)
                        if hweP is not None:
                            snpQcInfo_t = pd.concat(
                                [snpQcInfo_t,
                                 hweP.reindex(snpQcInfo_t.index)],
                                axis=1)
                call_rate = None
                maf = None
                hweP = None
                if snpQcInfo is None and snpQcInfo_t is not None:
                    snpQcInfo = snpQcInfo_t
                elif snpQcInfo_t is not None:
                    snpQcInfo = pd.concat([snpQcInfo, snpQcInfo_t],
                                          axis=0,
                                          sort=False)
                ##First process SNPQc than check if we can continue.
                if len(snp_df.columns) == 0:
                    continue
                elif (not plinkGenotype):
                    snp_df_dosage = snp_df_dosage.loc[:,
                                                      np.unique(snp_df.columns
                                                                )]
                #We could make use of relatedness when imputing.  And impute only based on genetically unique individuals.
                snp_df = pd.DataFrame(fill_NaN.fit_transform(snp_df),
                                      index=snp_df.index,
                                      columns=snp_df.columns)
                if (not plinkGenotype):
                    snp_df_dosage = pd.DataFrame(
                        fill_NaN.fit_transform(snp_df_dosage),
                        index=snp_df_dosage.index,
                        columns=snp_df_dosage.columns)
                ##No more snp_matrix_DF > snp_df


#                test if the covariates, kinship, snp and phenotype are in the same order
                if (len(snp_df.index) != len(sample2individual_feature.loc[
                        phenotype_ds.index]['iid'])
                        or not all(snp_df.index == sample2individual_feature.
                                   loc[phenotype_ds.index]['iid'])):
                    print(
                        'There is an issue in mapping phenotypes and genotypes'
                    )
                    sys.exit()

                G = snp_df.values
                if (not plinkGenotype):
                    G = snp_df_dosage.values
                G = G.astype(float)
                G_index = snp_df.columns

                alt_lmls, effsizes = flmm.fast_scan(G, verbose=False)
                var_pvalues = lrt_pvalues(null_lml, alt_lmls)
                var_effsizes_se = effsizes_se(effsizes, var_pvalues)

                #add these results to qtl_results
                temp_df = pd.DataFrame(index=range(len(G_index)),
                                       columns=[
                                           'feature_id', 'snp_id', 'p_value',
                                           'beta', 'beta_se',
                                           'empirical_feature_p_value'
                                       ])
                temp_df['snp_id'] = G_index
                temp_df['feature_id'] = feature_id
                temp_df['beta'] = np.asarray(effsizes)
                temp_df['p_value'] = np.asarray(var_pvalues)
                temp_df['beta_se'] = np.asarray(var_effsizes_se)
                #insert default dummy value
                temp_df['empirical_feature_p_value'] = -1.0

                if (n_perm != 0):
                    pValueBuffer = []
                    totalSnpsToBeTested = (G.shape[1] * n_perm)
                    permutationStepSize = np.floor(
                        n_perm / (totalSnpsToBeTested / blocksize))
                    if (permutationStepSize > n_perm):
                        permutationStepSize = n_perm
                    elif (permutationStepSize < 1):
                        permutationStepSize = 1

                    if (write_permutations):
                        perm_df = pd.DataFrame(
                            index=range(len(G_index)),
                            columns=['snp_id'] +
                            ['permutation_' + str(x) for x in range(n_perm)])
                        perm_df['snp_id'] = G_index
                    for currentNperm in utils.chunker(
                            list(range(1, n_perm + 1)), permutationStepSize):
                        if (kinship_df is not None) and (relatedness_score
                                                         is not None):
                            if (plinkGenotype):
                                temp = utils.get_shuffeld_genotypes_preserving_kinship(
                                    geneticaly_unique_individuals,
                                    relatedness_score, snp_df,
                                    kinship_df.loc[individual_ids,
                                                   individual_ids],
                                    len(currentNperm))
                            else:
                                temp = utils.get_shuffeld_genotypes_preserving_kinship(
                                    geneticaly_unique_individuals,
                                    relatedness_score, snp_df_dosage,
                                    kinship_df.loc[individual_ids,
                                                   individual_ids],
                                    len(currentNperm))
                        else:
                            if (plinkGenotype):
                                temp = utils.get_shuffeld_genotypes(
                                    snp_df, len(currentNperm))
                            else:
                                temp = utils.get_shuffeld_genotypes(
                                    snp_df_dosage, len(currentNperm))
                        temp = temp.astype(float)
                        alt_lmls_p, effsizes_p = flmm.fast_scan(temp,
                                                                verbose=False)
                        var_pvalues_p = lrt_pvalues(null_lml, alt_lmls_p)
                        pValueBuffer.extend(np.asarray(var_pvalues_p))
                    if (not (len(pValueBuffer) == totalSnpsToBeTested)):
                        #print(len(pValueBuffer))
                        #print(pValueBuffer)
                        #print(totalSnpsToBeTested)
                        print('Error in blocking logic for permutations.')
                        sys.exit()
                    perm = 0
                    for relevantOutput in utils.chunker(
                            pValueBuffer, G.shape[1]):
                        if (write_permutations):
                            perm_df['permutation_' +
                                    str(perm)] = relevantOutput
                        if (bestPermutationPval[perm] > min(relevantOutput)):
                            bestPermutationPval[perm] = min(relevantOutput)
                        perm = perm + 1
                        #print(relevantOutput)
                        #print('permutation_'+str(perm))

                if not temp_df.empty:
                    data_written = True
                    output_writer.add_result_df(temp_df)
                    if (write_permutations):
                        permutation_writer.add_permutation_results_df(
                            perm_df, feature_id)
            #This we need to change in the written file.
        if (n_perm > 1 and data_written):
            #updated_permuted_p_in_hdf5(bestPermutationPval, feature_id);
            alpha_para, beta_para = output_writer.apply_pval_correction(
                feature_id, bestPermutationPval, cis_mode)
            #np.savetxt(output_dir+"/Permutation.pValues."+feature_id+".txt",bestPermutationPval)
            alpha_params.append(alpha_para)
            beta_params.append(beta_para)
        if not data_written:
            fail_qc_features.append(feature_id)
        else:
            n_samples.append(phenotype_ds.size)
            n_e_samples.append(len(geneticaly_unique_individuals))
        if contains_missing_samples:
            QS = QS_tmp
            geneticaly_unique_individuals = tmp_unique_individuals
            del QS_tmp
            del tmp_unique_individuals
            if snpQcInfo is not None:
                snpQcInfo.index.name = "snp_id"
                snpQcInfo.to_csv(
                    output_dir +
                    '/snp_qc_metrics_naContaining_feature_{}.txt'.format(
                        feature_id),
                    sep='\t')
        else:
            if (snpQcInfo is not None and snpQcInfoMain is not None):
                snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo],
                                          axis=0,
                                          sort=False)
            elif snpQcInfo is not None:
                snpQcInfoMain = snpQcInfo.copy(deep=True)
        #if snpQcInfo is not None:
        #snpQcInfo2 = snpQcInfo.copy().transpose()
        #snpQcInfo2.to_csv(output_dir+'/snp_qc_metrics_feature_{}.txt'.format(feature_id),sep='\t')
        #print('step 5')
    output_writer.close()

    if (write_permutations):
        permutation_writer.close()
    fail_qc_features = np.unique(fail_qc_features)
    if ((len(feature_list) - len(fail_qc_features)) == 0):
        time.sleep(15)
        #Safety timer to make sure the file is unlocked.
        print("Trying to remove the h5 file. Nothing has been tested.")
        print(output_dir + 'qtl_results_{}_{}_{}.h5'.format(
            chromosome, selectionStart, selectionEnd))
        if not selectionStart is None:
            os.remove(output_dir + 'qtl_results_{}_{}_{}.h5'.format(
                chromosome, selectionStart, selectionEnd))
        else:
            os.remove(output_dir + 'qtl_results_{}.h5'.format(chromosome))
        sys.exit()
    #gather unique indexes of tested SNPs

    tested_snp_ids = list(set(tested_snp_ids))
    #write annotation and snp data to file
    snp_df = pd.DataFrame()
    snp_df['snp_id'] = bim['snp']
    snp_df['chromosome'] = bim['chrom']
    snp_df['position'] = bim['pos']
    snp_df['assessed_allele'] = bim['a1']
    snp_df.index = snp_df['snp_id']
    snp_df = snp_df.drop_duplicates()
    snp_df = snp_df.reindex(tested_snp_ids)
    snp_df = snp_df.drop_duplicates()

    if snpQcInfoMain is not None:
        snpQcInfoMain['index'] = snpQcInfoMain.index
        snpQcInfoMain = snpQcInfoMain.drop_duplicates()
        del snpQcInfoMain['index']
        snp_df = pd.concat(
            [snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1)

        if (snp_df.shape[1] == 5):
            snp_df.columns = [
                'snp_id', 'chromosome', 'position', 'assessed_allele',
                'call_rate'
            ]
        elif (snp_df.shape[1] == 6):
            snp_df.columns = [
                'snp_id', 'chromosome', 'position', 'assessed_allele',
                'call_rate', 'maf'
            ]
        else:
            snp_df.columns = [
                'snp_id', 'chromosome', 'position', 'assessed_allele',
                'call_rate', 'maf', 'hwe_p'
            ]

    feature_list = list(set(feature_list) - set(fail_qc_features))
    annotation_df = annotation_df.reindex(feature_list)
    annotation_df['n_samples'] = n_samples
    annotation_df['n_e_samples'] = n_e_samples

    if (n_perm > 1):
        annotation_df['alpha_param'] = alpha_params
        annotation_df['beta_param'] = beta_params

    if not selectionStart is None:
        snp_df.to_csv(output_dir + '/snp_metadata_{}_{}_{}.txt'.format(
            chromosome, selectionStart, selectionEnd),
                      sep='\t',
                      index=False)
        annotation_df.to_csv(output_dir +
                             '/feature_metadata_{}_{}_{}.txt'.format(
                                 chromosome, selectionStart, selectionEnd),
                             sep='\t')
    else:
        snp_df.to_csv(output_dir + '/snp_metadata_{}.txt'.format(chromosome),
                      sep='\t',
                      index=False)
        annotation_df.to_csv(output_dir +
                             '/feature_metadata_{}.txt'.format(chromosome),
                             sep='\t')
def run_plots(plinkGenotype, geno_prefix, annotation_filename, phenotype_filename,
              covariate_filename, top_qtl_results_filename, output_directory,
              sample_mapping_filename,randomeff_filename):

    # # Loading Files
    # phenotype_filename = "/Users/chaaya/dhonveli_dkfz/limix_qtl/limix_qtl-master/Limix_QTL/test_data/Expression/Geuvadis_CEU_YRI_Expr.txt"
    # annotation_filename = "/Users/chaaya/dhonveli_dkfz/limix_qtl/limix_qtl-master/Limix_QTL/test_data/Expression/Geuvadis_CEU_Annot.txt"
    # covariate_filename = "/Users/chaaya/dhonveli_dkfz/limix_qtl/limix_qtl-master/Limix_QTL/test_data/Expression/Geuvadis_CEU_YRI_covariates.txt"
    # sample_mapping_filename = "/Users/chaaya/dhonveli_dkfz/limix_qtl/limix_qtl-master/Limix_QTL/test_data/Geuvadis_CEU_gte.txt"
    # geno_prefix = "/Users/chaaya/dhonveli_dkfz/limix_qtl/limix_qtl-master/Limix_QTL/test_data/Genotypes/Geuvadis"
    # output_directory = "/Users/chaaya/dhonveli_dkfz/limix_qtl/limix_qtl-master/Limix_QTL/test_data/Output"
    # top_qtl_results_filename = "/Users/chaaya/dhonveli_dkfz/limix_qtl/limix_qtl-master/Limix_QTL/test_data/Output/top_qtl_results_all_FDR0.05.txt"
    # plinkGenotype = True


    [phenotype_df, kinship_df, readdepth_df, covariate_df, sample2individual_df,complete_annotation_df, annotation_df, snp_filter_df,
     snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, bim, fam, bed, bgen,
     chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]=\
    utils.run_QTL_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(pheno_filename=phenotype_filename,
                    anno_filename=annotation_filename, geno_prefix=geno_prefix, plinkGenotype=plinkGenotype, cis_mode=True,
                    skipAutosomeFiltering = False, minimum_test_samples= 10,
                    relatedness_score=None, snps_filename=None, feature_filename=None,
                    snp_feature_filename=None, selection='all',covariates_filename=covariate_filename,
                    randomeff_filename=randomeff_filename, sample_mapping_filename=sample_mapping_filename,
                    extended_anno_filename=None, feature_variant_covariate_filename=None)

    # results
    top_qtl_results_df = qtl_loader_utils.get_top_qtl_results(top_qtl_results_filename)

    for row in top_qtl_results_df.iterrows():

        # feature specific parameters for QS mixing
        rho1 = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        Sigma = {}
        Sigma_qs = {}
        best = {}
        snpQcInfo = None
        currentFeatureNumber+= 1

        phenotype_ds = phenotype_df.loc[row[1]["feature_id"]]
        individual_ids = sample2individual_df.loc[phenotype_ds.index, 'iid'].values
        sample2individual_feature = sample2individual_df.loc[phenotype_ds.index]

        if kinship_df is not None and readdepth_df is None:
            kinship_mat = kinship_df.loc[individual_ids,individual_ids].values
            kinship_mat = kinship_mat.astype(float)
            ##GOWER normalization of Kinship matrix.
            kinship_mat *= (kinship_mat.shape[0] - 1) / (kinship_mat.trace() - kinship_mat.mean(0).sum())
            ## This needs to go with the subselection stuff.
            if(QS is None and not contains_missing_samples):
                QS = economic_qs(kinship_mat)
            elif (contains_missing_samples):
                QS_tmp = QS
                QS = economic_qs(kinship_mat)

        randomeff_mix = False
        # combining the two matrices
        if kinship_df is not None and readdepth_df is not None:
            randomeff_mix = True
            kinship_mat = kinship_df.loc[individual_ids,individual_ids].values
            kinship_mat = kinship_mat.astype(float)
            ##GOWER normalization of Kinship matrix.
            kinship_mat *= (kinship_mat.shape[0] - 1) / (kinship_mat.trace() - kinship_mat.mean(0).sum())
            for rho in rho1:
                Sigma[rho] = rho * kinship_df + (1 - rho) * readdepth_df
                Sigma_qs[rho] = economic_qs(Sigma[rho])

        # creating a fake QS if none random effect is present or use the read depth one
        if kinship_df is None:
            if readdepth_df is None:
                K = np.eye(len(phenotype_ds.index))
                if(QS is None and not contains_missing_samples):
                    QS = economic_qs(K)
                elif (contains_missing_samples):
                    QS_tmp = QS
                    QS = economic_qs(K)
            else:
                if(QS is None and not contains_missing_samples):
                    QS = economic_qs(readdepth_df)
                elif (contains_missing_samples):
                    QS_tmp = QS
                    QS = economic_qs(readdepth_df)

        # covariance matrix
        cov_matrix = covariate_df.loc[sample2individual_feature['sample'], :].values if covariate_df is not None else None
        if covariate_df is None:
            cov_matrix = np.ones((len(individual_ids), 1))
        cov_matrix = cov_matrix.astype(float)

        phenotype = utils.force_normal_distribution(phenotype_ds.values,method="gaussnorm")

        # Prepare LMM
        phenotype = phenotype.astype(float)

        if randomeff_mix:
            # initialize best to minus infinite
            best["lml"] = - math.inf
            best["lmm"] = - math.inf
            best["rho1"] = - math.inf
            for rho, QS in Sigma_qs.items():
                lmm = LMM(phenotype, cov_matrix, QS)
                if not mixed:
                    lmm.delta = 1
                    lmm.fix('delta')
                lmm.fit(verbose=False)
                lml = lmm.lml()
                if lml > best["lml"]:
                    best["lml"] = lml
                    best["lmm"] = lmm
                    best["rho1"] = rho
            lmm = best["lmm"]
            print(best["rho1"])
            if best["rho1"] != 0:
                rho_log[(feature_id)] = best["rho1"]
                print("Read depth has actually an effect!")

        else:
            lmm = LMM(phenotype, cov_matrix, QS)
            if not mixed:
                lmm.delta = 1
                lmm.fix('delta')
            lmm.fit(verbose=False)

        # create phenotype_corrected_df for plotting
        phenotype_corrected = phenotype - cov_matrix[:, 1:].dot(lmm.beta[1:])
        phenotype_corrected_ds = pd.Series(data = phenotype_corrected, index = phenotype_ds.index, name="exp")

        qtl_plots(row, phenotype_ds, phenotype_corrected_ds, plinkGenotype, bim, fam, bed, annotation_df, sample2individual_df)
Ejemplo n.º 3
0
def _test_lmm(random, y, X, G, mvn, restricted):
    c = X.shape[1]
    QS = economic_qs_linear(G)
    lmm = LMM(y, X, QS, restricted=restricted)
    beta = lmm.beta
    v0 = lmm.v0
    v1 = lmm.v1

    K0 = G @ G.T
    assert_allclose(lmm.lml(), mvn(beta, v0, v1, y, X, K0))

    beta = random.randn(c)
    lmm.beta = beta
    assert_allclose(lmm.lml(), mvn(beta, v0, v1, y, X, K0))

    delta = random.rand(1).item()
    lmm.delta = delta
    v0 = lmm.v0
    v1 = lmm.v1
    assert_allclose(lmm.lml(), mvn(beta, v0, v1, y, X, K0))

    scale = random.rand(1).item()
    lmm.scale = scale
    v0 = lmm.v0
    v1 = lmm.v1
    assert_allclose(lmm.lml(), mvn(beta, v0, v1, y, X, K0))

    def fun(x):
        beta = x[:c]
        v0 = exp(x[c])
        v1 = exp(x[c + 1])
        return -mvn(beta, v0, v1, y, X, K0)

    res = minimize(fun, [0] * c + [0, 0])
    lmm.fit(verbose=False)
    assert_allclose(lmm.lml(), -res.fun, rtol=1e-3, atol=1e-6)
    assert_allclose(lmm.beta, res.x[:c], rtol=1e-3, atol=1e-6)
    assert_allclose(lmm.v0, exp(res.x[c]), rtol=1e-3, atol=1e-6)
    assert_allclose(lmm.v1, exp(res.x[c + 1]), rtol=1e-3, atol=1e-6)

    lmm = LMM(y, X, QS, restricted=restricted)
    beta = random.randn(c)
    lmm.beta = beta
    lmm.delta = random.rand(1).item()
    lmm.scale = random.rand(1).item()
    lmm.fix("beta")

    def fun(x):
        v0 = exp(x[0])
        v1 = exp(x[1])
        return -mvn(beta, v0, v1, y, X, K0)

    res = minimize(fun, [0, 0])
    lmm.fit(verbose=False)
    assert_allclose(lmm.lml(), -res.fun, rtol=1e-3, atol=1e-6)
    assert_allclose(lmm.v0, exp(res.x[0]), rtol=1e-3, atol=1e-6)
    assert_allclose(lmm.v1, exp(res.x[1]), rtol=1e-3, atol=1e-6)

    lmm = LMM(y, X, QS, restricted=restricted)
    lmm.beta = random.randn(c)
    delta = random.rand(1).item()
    lmm.delta = delta
    lmm.scale = random.rand(1).item()
    lmm.fix("delta")

    def fun(x):
        beta = x[:c]
        scale = exp(x[c])
        v0 = scale * (1 - delta)
        v1 = scale * delta
        return -mvn(beta, v0, v1, y, X, K0)

    res = minimize(fun, [0] * c + [0])
    lmm.fit(verbose=False)
    assert_allclose(lmm.lml(), -res.fun, rtol=1e-5, atol=1e-6)
    assert_allclose(lmm.beta, res.x[:c], rtol=1e-5, atol=1e-6)
    assert_allclose(lmm.scale, exp(res.x[c]), rtol=1e-5, atol=1e-6)

    lmm = LMM(y, X, QS, restricted=restricted)
    lmm.beta = random.randn(c)
    lmm.delta = random.rand(1).item()
    scale = random.rand(1).item()
    lmm.scale = scale
    lmm.fix("scale")

    def fun(x):
        beta = x[:c]
        delta = 1 / (1 + exp(-x[c]))
        v0 = scale * (1 - delta)
        v1 = scale * delta
        return -mvn(beta, v0, v1, y, X, K0)

    res = minimize(fun, [0] * c + [0])
    lmm.fit(verbose=False)
    assert_allclose(lmm.lml(), -res.fun, rtol=1e-5, atol=1e-6)
    assert_allclose(lmm.beta, res.x[:c], rtol=1e-3, atol=1e-6)
    assert_allclose(lmm.delta, 1 / (1 + exp(-res.x[c])), rtol=1e-3, atol=1e-6)
Ejemplo n.º 4
0
def run_PrsQtl_analysis(pheno_filename,
                        anno_filename,
                        prsFile,
                        output_dir,
                        min_call_rate=0.95,
                        blocksize=1000,
                        skipAutosomeFiltering=False,
                        gaussianize_method=None,
                        minimum_test_samples=10,
                        seed=np.random.randint(40000),
                        n_perm=0,
                        write_permutations=False,
                        relatedness_score=None,
                        feature_variant_covariate_filename=None,
                        snps_filename=None,
                        feature_filename=None,
                        snp_feature_filename=None,
                        genetic_range='all',
                        covariates_filename=None,
                        kinship_filename=None,
                        sample_mapping_filename=None,
                        regressCovariatesUpfront=False):
    fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=0)
    print('Running GRS QT analysis.')
    lik = 'normal'
    '''Core function to take input and run QTL tests on a given chromosome.'''
    if relatedness_score is not None:
        relatedness_score = float(relatedness_score)
    [phenotype_df, kinship_df, covariate_df, sample2individual_df, annotation_df, snp_filter_df, snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, risk_df, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]=\
    utils.run_PrsQtl_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(pheno_filename=pheno_filename, anno_filename=anno_filename, prsFile=prsFile, skipAutosomeFiltering = skipAutosomeFiltering,
                      minimum_test_samples= minimum_test_samples,  relatedness_score=relatedness_score, snps_filename=snps_filename, feature_filename=feature_filename, snp_feature_filename=snp_feature_filename, selection=genetic_range,
                     covariates_filename=covariates_filename, kinship_filename=kinship_filename, sample_mapping_filename=sample_mapping_filename, feature_variant_covariate_filename=feature_variant_covariate_filename)

    mixed = kinship_df is not None
    if (kinship_df is None) or (relatedness_score is None):
        geneticaly_unique_individuals = sample2individual_df['iid'].values
    QS = None
    if (feature_list == None or len(feature_list) == 0):
        print('No features to be tested.')
        sys.exit()

    #Open output files
    qtl_loader_utils.ensure_dir(output_dir)
    if not selectionStart is None:
        output_writer = qtl_output.hdf5_writer(
            output_dir + '/qtl_results_{}_{}_{}.h5'.format(
                chromosome, selectionStart, selectionEnd))
    else:
        output_writer = qtl_output.hdf5_writer(
            output_dir + '/qtl_results_{}.h5'.format(chromosome))
    if (write_permutations):
        if not selectionStart is None:
            permutation_writer = qtl_output.hdf5_permutations_writer(
                output_dir + '/perm_results_{}_{}_{}.h5'.format(
                    chromosome, selectionStart, selectionEnd), n_perm)
        else:
            permutation_writer = qtl_output.hdf5_permutations_writer(
                output_dir + '/perm_results_{}.h5'.format(chromosome), n_perm)

    #Arrays to store indices of snps tested and pass and fail QC SNPs for features without missingness.
    tested_snp_names = []
    fail_qc_features = []
    alpha_params = []
    beta_params = []
    n_samples = []
    n_e_samples = []
    na_containing_features = 0
    currentFeatureNumber = 0
    snpQcInfoMain = None
    for feature_id in feature_list:
        snpQcInfo = None
        currentFeatureNumber += 1
        if (len(phenotype_df.loc[feature_id, :])) < minimum_test_samples:
            print("Feature: " + feature_id +
                  " not tested not enough samples do QTL test.")
            fail_qc_features.append(feature_id)
            geneticaly_unique_individuals = tmp_unique_individuals
            continue
        data_written = False
        contains_missing_samples = False
        snpQuery = risk_df.index.values
        snp_cov_df = None

        if (feature_variant_covariate_df is not None):
            if (feature_id in feature_variant_covariate_df['feature'].values):
                covariateSnp = feature_variant_covariate_df['snp_id'].values[
                    feature_variant_covariate_df['feature'] == feature_id]
                if (any(i in risk_df.index.values for i in covariateSnp)):
                    snp_cov_df = risk_df.loc[risk_df.index.map(
                        lambda x: x in list(covariateSnp)), :].transpose()

        if (len(snpQuery) != 0) and (snp_filter_df is not None):
            snpQuery = list(
                set(snp_filter_df.index).intersection(set(snpQuery)))

        if (len(snpQuery) != 0) and (snp_feature_filter_df is not None):
            snpQuery = list(
                set(
                    np.unique(snp_feature_filter_df['snp_id'].loc[
                        snp_feature_filter_df['feature'] ==
                        feature_id])).intersection(set(snpQuery)))

        if len(snpQuery) == 0:
            print("Feature: " + feature_id +
                  " not tested. No SNPS passed QC for phenotype.")
            fail_qc_features.append(feature_id)
            continue
        else:
            phenotype_ds = phenotype_df.loc[feature_id]
            contains_missing_samples = any(~np.isfinite(phenotype_ds))
            if (contains_missing_samples):
                #import pdb; pdb.set_trace()
                print('Feature: ' + feature_id + ' contains missing data.')
                phenotype_ds.dropna(inplace=True)
                na_containing_features = na_containing_features + 1
            '''select indices for relevant individuals in genotype matrix
            These are not unique. NOT to be used to access phenotype/covariates data
            '''
            individual_ids = sample2individual_df.loc[phenotype_ds.index,
                                                      'iid'].values
            sample2individual_feature = sample2individual_df.loc[
                phenotype_ds.index]

            if contains_missing_samples:
                tmp_unique_individuals = geneticaly_unique_individuals
                if (kinship_df is not None) and (relatedness_score
                                                 is not None):
                    geneticaly_unique_individuals = utils.get_unique_genetic_samples(
                        kinship_df.loc[individual_ids, individual_ids],
                        relatedness_score)
                else:
                    geneticaly_unique_individuals = individual_ids
            if phenotype_ds.empty or len(
                    geneticaly_unique_individuals) < minimum_test_samples:
                print("Feature: " + feature_id +
                      " not tested not enough samples do QTL test.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue
            elif np.var(phenotype_ds.values) == 0:
                print("Feature: " + feature_id +
                      " has no variance in selected individuals.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue

            print('For feature: ' + str(currentFeatureNumber) + '/' +
                  str(len(feature_list)) + ' (' + feature_id + '): ' +
                  str(len(snpQuery)) +
                  ' risk scores will be tested.\n Please stand by.')
            if (n_perm != 0):
                bestPermutationPval = np.ones((n_perm), dtype=np.float)

            #Here we need to start preparing the LMM, can use the fam for sample IDS in SNP matrix.


#                test if the covariates, kinship, snp and phenotype are in the same order
            if ((all(kinship_df.loc[individual_ids,individual_ids].index==sample2individual_feature.loc[phenotype_ds.index]['iid']) if kinship_df is not None else True) &\
                 (all(phenotype_ds.index==covariate_df.loc[sample2individual_feature['sample'],:].index)if covariate_df is not None else True)):
                '''
                if all lines are in order put in arrays the correct genotype and phenotype
                x=a if cond1 else b <---> equivalent to if cond1: x=a else x=b;                 better readability of the code
                 '''
                if kinship_df is not None:
                    kinship_mat = kinship_df.loc[individual_ids,
                                                 individual_ids].values
                    kinship_mat = kinship_mat.astype(float)
                    ##GOWER normalization of Kinship matrix.
                    kinship_mat *= (kinship_mat.shape[0] - 1) / (
                        kinship_mat.trace() - kinship_mat.mean(0).sum())
                    ## This needs to go with the subselection stuff.
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(kinship_mat)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(kinship_mat)
                if kinship_df is None:
                    K = np.eye(len(phenotype_ds.index))
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(K)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(K)
                cov_matrix = covariate_df.loc[sample2individual_feature[
                    'sample'], :].values if covariate_df is not None else None
                if covariate_df is None:
                    cov_matrix = np.ones((len(individual_ids), 1))
                #pdb.set_trace()
                if snp_cov_df is not None:
                    snp_cov_df_tmp = snp_cov_df.loc[individual_ids, :]
                    snp_cov_df = pd.DataFrame(
                        fill_NaN.fit_transform(snp_cov_df_tmp))
                    snp_cov_df.index = sample2individual_feature['sample']
                    snp_cov_df.columns = snp_cov_df_tmp.columns
                    cov_matrix = np.concatenate(
                        (cov_matrix, snp_cov_df.values), 1)
                    snp_cov_df_tmp = None
                    snp_cov_df = None
                cov_matrix = cov_matrix.astype(float)
            else:
                print(
                    'There is an issue in mapping phenotypes vs covariates and/or kinship'
                )
                sys.exit()

            phenotype = utils.force_normal_distribution(
                phenotype_ds.values, method=gaussianize_method
            ) if gaussianize_method is not None else phenotype_ds.values

            #Prepare LMM
            phenotype = phenotype.astype(float)

            ##Mixed and test.
            ##This is a future change so we don't need to decompose the COVs every time.
            ##Like QS this needs to happen when genetic unique individuals is the same.
            #svd_cov = economic_svd(cov_matrix)
            #lmm = LMM(phenotype, cov_matrix, QS, SVD=svd_cov)
            #These steps need to happen only once per phenotype.
            #print(QS)
            lmm = LMM(phenotype, cov_matrix, QS)
            if not mixed:
                lmm.delta = 1
                lmm.fix('delta')
            #Prepare null model.
            lmm.fit(verbose=False)
            if regressCovariatesUpfront:
                phenotype_corrected = phenotype - cov_matrix[:, 1:].dot(
                    lmm.beta[1:])
                cov_matrix_corrected = cov_matrix[:, 0]
                lmm = LMM(phenotype_corrected, cov_matrix_corrected, QS)
                lmm.fit(verbose=False)

            null_lml = lmm.lml()
            flmm = lmm.get_fast_scanner()
            #pdb.set_trace();
            for snpGroup in utils.chunker(snpQuery, blocksize):
                #Fix seed at the start of the first chunker so all permutations are based on the same random first split.
                np.random.seed(seed)

                snp_names = snpGroup

                tested_snp_names.extend(snp_names)
                snp_matrix_DF = risk_df.loc[snp_names,
                                            individual_ids].transpose()
                ##GRS var QC
                snp_matrix_DF = snp_matrix_DF.loc[:,
                                                  snp_matrix_DF.isna().sum(
                                                      axis=0) != snp_matrix_DF.
                                                  shape[0], ]
                snp_matrix_DF = snp_matrix_DF.loc[:, (
                    np.nanstd(snp_matrix_DF, axis=0) > 0)]
                #               test if the covariates, kinship, snp and phenotype are in the same order
                if (len(snp_matrix_DF.index) != len(
                        sample2individual_feature.loc[phenotype_ds.index]
                    ['iid']) or not all(
                        snp_matrix_DF.index == sample2individual_feature.loc[
                            phenotype_ds.index]['iid'])):
                    print(
                        'There is an issue in mapping phenotypes and genotypes'
                    )
                    sys.exit()
                #Impute missingness
                #pdb.set_trace()
                call_rate = 1 - snp_matrix_DF.isnull().sum() / len(
                    snp_matrix_DF.index)
                if snpQcInfo is None and call_rate is not None:
                    snpQcInfo = call_rate
                elif call_rate is not None:
                    snpQcInfo = pd.concat([snpQcInfo, call_rate], axis=0)
                selection = call_rate > min_call_rate
                snp_matrix_DF = snp_matrix_DF.loc[:,
                                                  list(snp_matrix_DF.
                                                       columns[selection])]
                if snp_matrix_DF.shape[1] == 0:
                    continue
                snp_matrix_DF = pd.DataFrame(
                    fill_NaN.fit_transform(snp_matrix_DF),
                    index=snp_matrix_DF.index,
                    columns=snp_matrix_DF.columns)
                #
                G = snp_matrix_DF.values
                G = G.astype(float)
                G_index = snp_matrix_DF.columns

                alt_lmls, effsizes = flmm.fast_scan(G, verbose=False)
                var_pvalues = lrt_pvalues(null_lml, alt_lmls)
                var_effsizes_se = effsizes_se(effsizes, var_pvalues)

                #add these results to qtl_results
                temp_df = pd.DataFrame(index=range(len(G_index)),
                                       columns=[
                                           'feature_id', 'snp_id', 'p_value',
                                           'beta', 'beta_se',
                                           'empirical_feature_p_value'
                                       ])
                temp_df['snp_id'] = G_index
                temp_df['feature_id'] = feature_id
                temp_df['beta'] = np.asarray(effsizes)
                temp_df['p_value'] = np.asarray(var_pvalues)
                temp_df['beta_se'] = np.asarray(var_effsizes_se)
                #insert default dummy value
                temp_df['empirical_feature_p_value'] = -1.0

                if (n_perm != 0):
                    pValueBuffer = []
                    totalSnpsToBeTested = (G.shape[1] * n_perm)
                    permutationStepSize = np.floor(
                        n_perm / (totalSnpsToBeTested / blocksize))
                    if (permutationStepSize > n_perm):
                        permutationStepSize = n_perm
                    elif (permutationStepSize < 1):
                        permutationStepSize = 1

                    if (write_permutations):
                        perm_df = pd.DataFrame(
                            index=range(len(G_index)),
                            columns=['snp_id'] +
                            ['permutation_' + str(x) for x in range(n_perm)])
                        perm_df['snp_id'] = G_index
                    for currentNperm in utils.chunker(
                            list(range(1, n_perm + 1)), permutationStepSize):
                        if (kinship_df is not None) and (relatedness_score
                                                         is not None):
                            temp = utils.get_shuffeld_genotypes_preserving_kinship(
                                geneticaly_unique_individuals,
                                relatedness_score, snp_matrix_DF,
                                kinship_df.loc[individual_ids, individual_ids],
                                len(currentNperm))
                        else:
                            temp = utils.get_shuffeld_genotypes(
                                snp_matrix_DF, len(currentNperm))
                        temp = temp.astype(float)
                        alt_lmls_p, effsizes_p = flmm.fast_scan(temp,
                                                                verbose=False)
                        var_pvalues_p = lrt_pvalues(null_lml, alt_lmls_p)
                        pValueBuffer.extend(np.asarray(var_pvalues_p))
                    if (not (len(pValueBuffer) == totalSnpsToBeTested)):
                        #print(len(pValueBuffer))
                        #print(pValueBuffer)
                        #print(totalSnpsToBeTested)
                        print('Error in blocking logic for permutations.')
                        sys.exit()
                    perm = 0
                    for relevantOutput in utils.chunker(
                            pValueBuffer, G.shape[1]):
                        if (write_permutations):
                            perm_df['permutation_' +
                                    str(perm)] = relevantOutput
                        if (bestPermutationPval[perm] > min(relevantOutput)):
                            bestPermutationPval[perm] = min(relevantOutput)
                        perm = perm + 1
                        #print(relevantOutput)
                        #print('permutation_'+str(perm))

                if not temp_df.empty:
                    data_written = True
                    output_writer.add_result_df(temp_df)
                    if (write_permutations):
                        permutation_writer.add_permutation_results_df(
                            perm_df, feature_id)
            #This we need to change in the written file.
            if (n_perm > 1 and data_written):
                #updated_permuted_p_in_hdf5(bestPermutationPval, feature_id);
                alpha_para, beta_para = output_writer.apply_pval_correction(
                    feature_id, bestPermutationPval, False)
                alpha_params.append(alpha_para)
                beta_params.append(beta_para)
                #pdb.set_trace();
            if not data_written:
                fail_qc_features.append(feature_id)
            else:
                n_samples.append(phenotype_ds.size)
                n_e_samples.append(len(geneticaly_unique_individuals))
            if contains_missing_samples:
                QS = QS_tmp
                geneticaly_unique_individuals = tmp_unique_individuals
                snpQcInfo = snpQcInfo.to_frame(name="call_rate")
                snpQcInfo.index.name = "snp_id"
                snpQcInfo.to_csv(
                    output_dir +
                    '/snp_qc_metrics_naContaining_feature_{}.txt'.format(
                        feature_id),
                    sep='\t')
                del QS_tmp
                del tmp_unique_individuals
            else:
                if (snpQcInfo is not None and snpQcInfoMain is not None):
                    snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo],
                                              axis=0)
                elif snpQcInfo is not None:
                    snpQcInfoMain = snpQcInfo.copy(deep=True)
                #print('step 5')
    output_writer.close()

    if (write_permutations):
        permutation_writer.close()
    fail_qc_features = np.unique(fail_qc_features)
    if ((len(feature_list) - len(fail_qc_features)) == 0):
        time.sleep(15)
        #Safety timer to make sure the file is unlocked.
        print("Trying to remove the h5 file. Nothing has been tested.")
        print(output_dir + 'qtl_results_{}_{}_{}.h5'.format(
            chromosome, selectionStart, selectionEnd))
        if not selectionStart is None:
            os.remove(output_dir + 'qtl_results_{}_{}_{}.h5'.format(
                chromosome, selectionStart, selectionEnd))
        else:
            os.remove(output_dir + 'qtl_results_{}.h5'.format(chromosome))
        sys.exit()
    #gather unique indexes of tested snps
    #write annotation and snp data to file
    snp_df = pd.DataFrame()
    snp_df['snp_id'] = np.unique(tested_snp_names)
    snp_df.index = np.unique(tested_snp_names)
    snp_df['chromosome'] = "NA"
    snp_df['position'] = "NA"
    if (snpQcInfoMain is not None):
        snpQcInfoMain = snpQcInfoMain.to_frame(name="call_rate")
        snpQcInfoMain['index'] = snpQcInfoMain.index
        snpQcInfoMain = snpQcInfoMain.drop_duplicates()
        del snpQcInfoMain['index']
        snp_df = pd.concat(
            [snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1)

    feature_list = list(set(feature_list) - set(fail_qc_features))
    annotation_df = annotation_df.reindex(feature_list)
    annotation_df['n_samples'] = n_samples
    annotation_df['n_e_samples'] = n_e_samples
    if (n_perm > 1):
        annotation_df['alpha_param'] = alpha_params
        annotation_df['beta_param'] = beta_params
    if not selectionStart is None:
        snp_df.to_csv(output_dir + '/snp_metadata_{}_{}_{}.txt'.format(
            chromosome, selectionStart, selectionEnd),
                      sep='\t',
                      index=False)
        annotation_df.to_csv(output_dir +
                             '/feature_metadata_{}_{}_{}.txt'.format(
                                 chromosome, selectionStart, selectionEnd),
                             sep='\t')
    else:
        snp_df.to_csv(output_dir + '/snp_metadata_{}.txt'.format(chromosome),
                      sep='\t',
                      index=False)
        annotation_df.to_csv(output_dir +
                             '/feature_metadata_{}.txt'.format(chromosome),
                             sep='\t')
def run_QTL_analysis(pheno_filename, anno_filename, geno_prefix, plinkGenotype, output_dir, window_size=250000,
                     min_maf=0.05, min_hwe_P=0.001, min_call_rate=None, blocksize=1000, cis_mode=True,
                     skipAutosomeFiltering = False, gaussianize_method=None, minimum_test_samples= 10,
                     seed=np.random.randint(40000), n_perm=0, write_permutations = False, write_feature_top_permutations = False,
                     relatedness_score=0.95, feature_variant_covariate_filename = None, snps_filename=None, feature_filename=None,
                     snp_feature_filename=None, genetic_range='all', covariates_filename=None, randomeff_filename=None,
                     sample_mapping_filename=None, extended_anno_filename=None, regressCovariatesUpfront = False, debugger=False):
    #Manual flag to set pearson (True), spearman (False). TODO add rank as an option to gaussnorm.
    pearson=True
    
    if regressCovariatesUpfront is not None:
        #This implementation can only handle regression before the association test (correlation).
        regressCovariatesUpfront= True
    tot_time = 0
    idx = 0
    
    print(relatedness_score)

    fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=0, copy=False)
    print('Running QTL analysis.')
    lik = 'normal'
    minimumProbabilityStep=0.1

    '''Core function to take input and run QTL tests on a given chromosome.'''
    
    # Check if relatedness_score is present as a measure of genotype similarity and hence, of sample similarity.
    if relatedness_score is not None:
        relatedness_score = float(relatedness_score)

    # Intersect files together to list the amount of samples with enough files
    if debugger:
        fun_start = time.time()
    [phenotype_df, kinship_df, randomeff_df, covariate_df, sample2individual_df,complete_annotation_df, annotation_df, snp_filter_df,
     snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, bim, fam, bed, bgen,
     chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]=\
    utils.run_QTL_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(pheno_filename=pheno_filename,
                    anno_filename=anno_filename, geno_prefix=geno_prefix, plinkGenotype=plinkGenotype, cis_mode=cis_mode,
                    skipAutosomeFiltering = skipAutosomeFiltering, minimum_test_samples= minimum_test_samples,
                    relatedness_score=relatedness_score, snps_filename=snps_filename, feature_filename=feature_filename,
                    snp_feature_filename=snp_feature_filename, selection=genetic_range,covariates_filename=covariates_filename,
                    randomeff_filename=randomeff_filename, sample_mapping_filename=sample_mapping_filename,
                    extended_anno_filename=extended_anno_filename, feature_variant_covariate_filename=feature_variant_covariate_filename)
    if debugger:
        fun_end = time.time()
        print(" Intersecting files took {}".format(fun_end-fun_start))

    # Check if kinship matrix is present. The matrix of pairwise genotype similarity. If they are not present took genetically unique individuals based on IDs
    mixed = kinship_df is not None
    if (kinship_df is None) or (relatedness_score is None) :
        geneticaly_unique_individuals = sample2individual_df['iid'].values
    QS = None
    
    # Check if feature list is empty (genes)
    if(feature_list==None or len(feature_list)==0):
        print ('No features to be tested.')
        sys.exit()

    #Open output files
    if debugger:
        fun_start = time.time()
    qtl_loader_utils.ensure_dir(output_dir)
    if not selectionStart is None :
        output_writer = qtl_output.hdf5_writer(output_dir+'/qtl_results_{}_{}_{}.h5'.format(chromosome,selectionStart,selectionEnd))
    else :
        output_writer = qtl_output.hdf5_writer(output_dir+'/qtl_results_{}.h5'.format(chromosome))
    if(write_permutations):
        if not selectionStart is None :
            permutation_writer = qtl_output.hdf5_permutations_writer(output_dir+'/perm_results_{}_{}_{}.h5'.format(chromosome,selectionStart,selectionEnd),n_perm)
        else :
            permutation_writer = qtl_output.hdf5_permutations_writer(output_dir+'/perm_results_{}.h5'.format(chromosome),n_perm)
    if debugger:
        fun_end = time.time()
        print(" Opening writing files took {}".format(fun_end-fun_start))

    #Arrays to store indices of snps tested and pass and fail QC SNPs for features without missingness.
    tested_snp_ids = []
    pass_qc_snps_all = []
    fail_qc_snps_all = []
    fail_qc_features = []
    alpha_params = []
    beta_params = []
    n_samples = []
    n_e_samples = []
    random_eff_param = []
    na_containing_features=0
    currentFeatureNumber=0
    snpQcInfoMain = None
    random_eff_param = []
    log = {}
    rho1 = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    Sigma = {}
    Sigma_qs = {}
    randomeff_mix = False
    
    #############################################################################################################################################################
    # Per feature LMM fitting
    #############################################################################################################################################################

    # start per feature computations
    for feature_id in feature_list:
        gc.collect()
        start_time = time.time()
        # log file production for rho values storing and computation time
        log[(feature_id)] = []
        # feature specific parameters for QS mixing
        mixingParameters = {}
        feature_best_rho = -1
        snpQcInfo = None
        # counter
        currentFeatureNumber+= 1

        ########################################################################################################################################################
        # check if enough phenotype samples to test this gene
        if (len(phenotype_df.loc[feature_id,:]))<minimum_test_samples:
            print("Feature: "+feature_id+" not tested not enough samples do QTL test (n="+str(len(phenotype_df.loc[feature_id,:]))+").")
            fail_qc_features.append(feature_id)
            geneticaly_unique_individuals = tmp_unique_individuals
            continue
        data_written = False
        contains_missing_samples = False
        ########################################################################################################################################################

        ########################################################################################################################################################
        # SNP selection based on gene location and window size
        if debugger:
            fun_start = time.time()
        snpQuery = utils.do_snp_selection(feature_id, complete_annotation_df, bim, cis_mode, window_size, skipAutosomeFiltering)
        if debugger:
            fun_end = time.time()
            print("SNP querying took {}".format(fun_end-fun_start))

        snp_cov_df = None
        if debugger:
            fun_start = time.time()
        ########################################################################################################################################################

        #########################################################################################################################################################
        # Check if a matrix of variant covariance is present. Like for example PCs covariates -- Understand better covariates to SNP
        if(feature_variant_covariate_df is not None):
            if(feature_id in feature_variant_covariate_df['feature_id'].values):
                # array of covariates per SNP and feature
                covariateSnp = feature_variant_covariate_df['snp_id'].values[feature_variant_covariate_df['feature_id']==feature_id]
                if(any(i in  bim['snp'].values for i in covariateSnp)):
                    snpQuery_cov = bim.loc[bim['snp'].map(lambda x: x in list(covariateSnp)),:]
                    if(plinkGenotype):
                        snp_cov_df = pd.DataFrame(data=bed[snpQuery_cov['i'].values,:].compute().transpose(),index=fam.index,columns=snpQuery_cov['snp'],)
                    else:
                        ##Here we make some assumptions on the SNPs. They are expected to be ploidy 2!
                        ##Also we don't use a minimal quality to assure a value is present for all samples.
                        print('Warning, during the regression of SNPs we assume ploidy 2.')
                        snp_cov_df_t = pd.DataFrame(columns=fam.index)
                        rowNumber = 0
                        for snpId in snpQuery_cov['i'] :
                            geno = bgen["genotype"][snpId].compute()
                            if(geno["phased"]):
                                snp_df_dosage_t = geno["probs"][:,[0,2]].sum(1).astype(float)
                                snp_df_dosage_t[(np.amax(geno["probs"][:,:2],1)+np.amax(geno["probs"][:,2:4],1))<(1+minimumProbabilityStep)] = float('NaN')
                            else :
                                snp_df_dosage_t = (geno["probs"][:,0]* 2)+geno["probs"][:,1]
                                snp_df_dosage_t[np.amax(geno["probs"][:,:3],1)<((1/3)+minimumProbabilityStep)] = float('NaN')
                            snp_df_dosage_t = pd.Series(snp_df_dosage_t, index= fam.index)
                            snp_df_dosage_t.name = snpId
                            snp_cov_df_t = snp_cov_df_t.append(snp_df_dosage_t)
                            rowNumber = rowNumber +1
                        snp_cov_df = snp_cov_df_t.transpose()
                        snp_cov_df_t = None
        if debugger:
            fun_end = time.time()
            print(" Selecting feature variant covariate took {}".format(fun_end-fun_start))
        ########################################################################################################################################################

        ########################################################################################################################################################
        # Check the number of SNP to be tested and look if there is some SNP or feature filtering requirement
        if (len(snpQuery) != 0) and (snp_filter_df is not None):
            toSelect = set(snp_filter_df.index).intersection(set(snpQuery['snp']))
            snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)]

        if (len(snpQuery) != 0) and (snp_feature_filter_df is not None):
            toSelect = set(np.unique(snp_feature_filter_df['snp_id'].loc[snp_feature_filter_df['feature_id']==feature_id])).intersection(set(snpQuery['snp']))
            snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)]

        if len(snpQuery) == 0:
            print("Feature: "+feature_id+" not tested. No SNPS passed QC for phenotype.")
            fail_qc_features.append(feature_id)
            continue   
        ########################################################################################################################################################

        else:
            # selecting phenotype array
            phenotype_ds = phenotype_df.loc[feature_id]
            contains_missing_samples = any(~np.isfinite(phenotype_ds))
            #####################################################################################################################################################
            # check for missing samples according to specific feature otherwise use previous SNP QC information
            if(contains_missing_samples):
                print ('Feature: ' + feature_id + ' contains missing data.')
                phenotype_ds.dropna(inplace=True)
                na_containing_features = na_containing_features+1
            '''select indices for relevant individuals in genotype matrix
            These are not unique. NOT to be used to access phenotype/covariates data
            '''
            individual_ids = sample2individual_df.loc[phenotype_ds.index,'iid'].values
            sample2individual_feature= sample2individual_df.loc[phenotype_ds.index]
            
            if(contains_missing_samples):
                tmp_unique_individuals = geneticaly_unique_individuals
                if (kinship_df is not None) and (relatedness_score is not None):
                    geneticaly_unique_individuals = utils.get_unique_genetic_samples(kinship_df.loc[individual_ids,individual_ids], relatedness_score);
                else:
                    geneticaly_unique_individuals = individual_ids
            #####################################################################################################################################################

            else:
                #If no missing samples we can use the previous SNP QC information before actually loading data.
                #This allows for more efficient blocking and retrieving of data
                snpQuery = snpQuery.loc[snpQuery['snp'].map(lambda x: x not in list(map(str, fail_qc_snps_all)))]

            #####################################################################################################################################################
            # check for enough samples for QTL test
            if phenotype_ds.empty or len(geneticaly_unique_individuals)<minimum_test_samples :
                print("Feature: "+feature_id+" not tested not enough samples do QTL test.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue
            elif np.var(phenotype_ds.values) == 0:
                print("Feature: "+feature_id+" has no variance in selected individuals.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue
            #####################################################################################################################################################

            print ('For feature: ' +str(currentFeatureNumber)+ '/'+str(len(feature_list))+ ' (' + feature_id + '): ' + str(snpQuery.shape[0]) + ' SNPs need to be tested.\n Please stand by.')

            ##########################################################################################################################################################
            # SNP TESTING
            ##########################################################################################################################################################
            
            if(n_perm!=0):
                bestPermutationPval = np.ones((n_perm), dtype=np.float)
            
            #Here we need to start preparing the LMM, can use the fam for sample IDS in SNP matrix.
            #test if the covariates, kinship, snp and phenotype are in the same order
            if ((all(kinship_df.loc[sample2individual_df['iid'],sample2individual_df['iid']].index==sample2individual_feature.loc[phenotype_ds.index]['iid']) if kinship_df is not None else True) &\
                 (all(phenotype_ds.index==randomeff_df.loc[sample2individual_feature['sample'],sample2individual_feature['sample']].index) if randomeff_df is not None else True) &\
                 (all(phenotype_ds.index==covariate_df.loc[sample2individual_feature['sample'],:].index) if covariate_df is not None else True)):
                '''
                if all lines are in order put in arrays the correct genotype and phenotype
                x=a if cond1 else b <---> equivalent to if cond1: x=a else x=b;                 better readability of the code
                '''
                #######################################################################################################################################################
                # look if kinship or other random effect dataframe are present and QS computation !
                
                if debugger:
                    fun_start = time.time()
                if kinship_df is not None and randomeff_df is None:
                    kinship_mat = kinship_df.loc[individual_ids,individual_ids].values
                    kinship_mat = kinship_mat.astype(float)

                    ##GOWER normalization of Kinship matrix.
                    kinship_mat *= (kinship_mat.shape[0] - 1) / (kinship_mat.trace() - kinship_mat.mean(0).sum())
                    ## This needs to go with the subselection stuff.
                    if(QS is None and not contains_missing_samples):
                        QS = economic_qs(kinship_mat)
                    elif (contains_missing_samples):
                        QS = economic_qs(kinship_mat)
                # combining the two matrices
                if kinship_df is not None and randomeff_df is not None:
                    #Here we need to match names and make sure that the order is the same and the right samples get mixed.
                    randomeff_mix = True
                    if(not Sigma_qs and not contains_missing_samples):
                        kinship_mat = kinship_df.loc[individual_ids,individual_ids].values
                        kinship_mat = kinship_mat.astype(float)
                        
                        randomeff_mat = randomeff_df.loc[sample2individual_feature['sample'],sample2individual_feature['sample']].values
                        randomeff_mat = randomeff_mat.astype(float)
                        for rho in rho1:
                            Sigma[rho] = rho * kinship_mat + (1 - rho) * randomeff_mat
                            Sigma[rho] *= (Sigma[rho].shape[0] - 1) / (Sigma[rho].trace() - Sigma[rho].mean(0).sum())
                            Sigma_qs[rho] = economic_qs(Sigma[rho])

                    elif (contains_missing_samples):
                        #To fix: this needs to be reset after running with missing samples. Now Missing!!
                        kinship_mat = kinship_df.loc[individual_ids,individual_ids].values
                        kinship_mat = kinship_mat.astype(float)

                        randomeff_mat = randomeff_df.loc[sample2individual_feature['sample'],sample2individual_feature['sample']].values
                        randomeff_mat = randomeff_mat.astype(float)

                        for rho in rho1:
                            Sigma[rho] = rho * kinship_mat + (1 - rho) * randomeff_mat
                            ##GOWER normalization of Kinship matrix.
                            Sigma[rho] *= (Sigma[rho].shape[0] - 1) / (Sigma[rho].trace() - Sigma[rho].mean(0).sum())
                            Sigma_qs[rho] = economic_qs(Sigma[rho])

                # if kinship_df is None and randomeff_df is not None: 
                #     randomeff_mat = randomeff_df.loc[individual_ids,individual_ids].values
                #     randomeff_mat = randomeff_mat.astype(float)

                #     ##GOWER normalization of Kinship matrix.
                #     randomeff_mat *= (randomeff_mat.shape[0] - 1) / (randomeff_mat.trace() - randomeff_mat.mean(0).sum())
                #     ## This needs to go with the subselection stuff.
                #     if(QS is None and not contains_missing_samples):
                #         QS = economic_qs(randomeff_mat)
                #     elif (contains_missing_samples):
                #         QS = economic_qs(randomeff_mat)

                # creating a fake QS if none random effect is present or use the read depth one
                if kinship_df is None:
                    if randomeff_df is None:
                        K = np.eye(len(phenotype_ds.index))
                        if(QS is None and not contains_missing_samples):
                            QS = economic_qs(K)
                        elif (contains_missing_samples):
                            QS = economic_qs(K)
                    else:
                        if(QS is None and not contains_missing_samples):
                            QS = economic_qs(randomeff_df)
                        elif (contains_missing_samples):
                            QS = economic_qs(randomeff_df)

                if debugger:
                    fun_end = time.time()
                    print(" Computing QS took {}".format(fun_end-fun_start))
                #######################################################################################################################################################

                #######################################################################################################################################################
                # covariance matrix setting
                cov_matrix =  covariate_df.loc[sample2individual_feature['sample'],:].values if covariate_df is not None else None
                if covariate_df is None:
                    cov_matrix = np.ones((len(individual_ids), 1))
                if snp_cov_df is not None:
                    snp_cov_df_tmp = snp_cov_df.loc[individual_ids,:]
                    snp_cov_df_tmp.index=sample2individual_feature['sample']
                    snp_cov_df = pd.DataFrame(fill_NaN.fit_transform(snp_cov_df_tmp))
                    snp_cov_df.index=snp_cov_df_tmp.index
                    snp_cov_df.columns=snp_cov_df_tmp.columns
                    cov_matrix = np.concatenate((cov_matrix,snp_cov_df.values),1)
                    snp_cov_df_tmp = None
                    snp_cov_df = None
                cov_matrix = cov_matrix.astype(float)
                #######################################################################################################################################################
            else:
                print ('There is an issue in mapping phenotypes vs covariates and/or kinship')
                sys.exit()

            ###########################################################################################################################################################
            # force normal distribution of expression values
            phenotype_ds = pd.Series(data= utils.force_normal_distribution(phenotype_ds.values,method=gaussianize_method) if gaussianize_method is not None else phenotype_ds.values,index=phenotype_ds.index,name=phenotype_ds.name)
            ###########################################################################################################################################################
            
            ##########################################################################################################################################################
            # Regressing up covariates
            if debugger:
                fun_start = time.time()
            if regressCovariatesUpfront:
                phenotype = phenotype_ds.values
                ##Using LMM/LM to regress covariates up front. 
                # Computing Null Model 
                if debugger:
                    fun_start = time.time()
                if randomeff_mix:
                    mixingParameters = utils.rhoTest(None, phenotype,cov_matrix,Sigma_qs,mixed, None)
                    lmm = mixingParameters["lmm"]
                    log[(feature_id)].append(mixingParameters["rho"])
                    feature_best_rho = mixingParameters["rho"]
                
                    if mixingParameters["rho"]!=0:
                        print("Random effect has influence, mixing parameter: "+str(mixingParameters["rho"]))
                    else :
                        print("Only kinship has effect.")
                    
                else:
                    lmm = LMM(phenotype, cov_matrix, QS)
                    if not mixed:
                         lmm.delta = 1
                         lmm.fix('delta')
                    lmm.fit(verbose=False)
                if debugger:
                    fun_end = time.time()
                    print(" Computing Null model took {}".format(fun_end-fun_start))
                #Replace phenotype with corrected phenotype:
                phenotype_ds = pd.Series(data= (phenotype-cov_matrix[:,1:].dot(lmm.beta[1:])),index=phenotype_ds.index,name=phenotype_ds.name)
            
            if debugger:
                fun_end = time.time()
                print(" Regressing Covariates took {}".format(fun_end-fun_start))
            ##########################################################################################################################################################
            if debugger:
                fun_start = time.time()
            ##########################################################################################################################################################
            # Fast scanning - iterate according to a blocksize 
            countChunker = 0
            
            for snpGroup in utils.chunker(snpQuery, blocksize):
                countChunker=countChunker+1
                #print(countChunker)
                #Fix seed at the start of the first chunker so all permutations are based on the same random first split.
                np.random.seed(seed)
                #print(snpGroup)
                snp_idxs = snpGroup['i'].values
                snp_names = snpGroup['snp'].values

                tested_snp_ids.extend(snp_names)
                #subset genotype matrix, we cannot subselect at the same time, do in two steps.
                if debugger:
                    fun_start = time.time()
                ##########################################################################################################################################################
                # SNP dataframe creation
                if(plinkGenotype):
                    snp_df = pd.DataFrame(data=bed[snp_idxs,:].compute().transpose(),index=fam.index,columns=snp_names)
                else :
                    snp_df_dosage = pd.DataFrame(np.nan,index=fam.index, columns = snp_names)
                    snp_df = pd.DataFrame(np.nan,index=fam.index, columns = snp_names)
                    rowNumber = 0
                    for snpId in snp_idxs :
                        geno = bgen["genotype"][snpId].compute()
                        if (geno["ploidy"].min()>1 & geno["ploidy"].max()<3) :
                            if(geno["phased"]):
                                snp_df_dosage_t = geno["probs"][:,[0,2]].sum(1).astype(float)
                                snp_df_t = (np.abs(np.argmax(geno["probs"][:,:2], axis=1)-1)+np.abs(np.argmax(geno["probs"][:,2:4], axis=1)-1)).astype(float)
                                naId = (np.amax(geno["probs"][:,:2],1)+np.amax(geno["probs"][:,2:4],1))<(1+minimumProbabilityStep)
                                snp_df_dosage_t[naId] = float('NaN')
                                snp_df_t[naId] = float('NaN')
                            else :
                                snp_df_dosage_t = ((geno["probs"][:,0]* 2)+geno["probs"][:,1]).astype(float)
                                snp_df_t = (np.abs(np.argmax(geno["probs"][:,:3], axis=1)-2)).astype(float)
                                naId = np.amax(geno["probs"][:,:3],1)<((1/3)+minimumProbabilityStep)
                                snp_df_dosage_t[naId] = float('NaN')
                                snp_df_t[naId] = float('NaN')
                            snp_df_dosage.loc[:,snp_names[rowNumber]] = snp_df_dosage_t
                            snp_df.loc[:,snp_names[rowNumber]] = snp_df_t
                        rowNumber = rowNumber +1
                    snp_df_dosage = snp_df_dosage.loc[individual_ids,:]
                
                snp_df = snp_df.loc[individual_ids,:]
            
                snp_df = snp_df.loc[:,np.unique(snp_df.columns)[np.unique(snp_df.columns,return_counts=1)[1]==1]]
                if debugger:
                    fun_end = time.time()
                    print(" Subsetting genotype matrix took {}".format(fun_end-fun_start))

                ##########################################################################################################################################################
                #SNP QC.
                if debugger:
                    fun_start = time.time()
                if not contains_missing_samples:
                    #remove SNPs from snp_df if they have previously failed QC
                    snp_df = snp_df.loc[:,snp_df.columns[~snp_df.columns.isin(fail_qc_snps_all)]]
                    if snp_df.shape[1] == 0:
                        continue
                    snps_to_test_df = snp_df.loc[:,snp_df.columns[~snp_df.columns.isin(pass_qc_snps_all)]]
                    if snps_to_test_df.shape[1] > 0:
                        #Only do QC on relevant SNPs. join pre-QCed list and new QCed list.
                        if kinship_df is not None:
                            passed_snp_names,failed_snp_names,call_rate,maf,hweP = do_snp_qc(snps_to_test_df.iloc[np.unique(snps_to_test_df.index,return_index=1)[1]].loc[geneticaly_unique_individuals,:], min_call_rate, min_maf, min_hwe_P)
                        else:
                            passed_snp_names,failed_snp_names,call_rate,maf,hweP = do_snp_qc(snps_to_test_df, min_call_rate, min_maf, min_hwe_P)
                        snps_to_test_df = None
                        #append snp_names and failed_snp_names
                        pass_qc_snps_all.extend(passed_snp_names)
                        fail_qc_snps_all.extend(failed_snp_names)
                    snp_df = snp_df.loc[:,snp_df.columns[snp_df.columns.isin(pass_qc_snps_all)]]
                else:
                    #Do snp QC for relevant section.
                    #Get relevant slice from: phenotype_ds
                    if kinship_df is not None:
                        passed_snp_names,failed_snp_names,call_rate,maf,hweP = do_snp_qc(snp_df.iloc[np.unique(snp_df.index,return_index=1)[1]].loc[geneticaly_unique_individuals,:], min_call_rate, min_maf, min_hwe_P)
                    else:
                        passed_snp_names,failed_snp_names,call_rate,maf,hweP = do_snp_qc(snp_df, min_call_rate, min_maf, min_hwe_P)
                    snp_df = snp_df.loc[:,snp_df.columns[snp_df.columns.isin(passed_snp_names)]]
                snpQcInfo_t = None
                if call_rate is not None:
                    snpQcInfo_t = call_rate
                    if maf is not None:
                        snpQcInfo_t = pd.concat([snpQcInfo_t,maf.reindex(snpQcInfo_t.index)],axis=1)
                        if hweP is not None:
                            snpQcInfo_t = pd.concat([snpQcInfo_t,hweP.reindex(snpQcInfo_t.index)],axis=1)
                if debugger:
                    fun_end = time.time()
                    print(" SNP quality control took {}".format(fun_end-fun_start))
                ##########################################################################################################################################################

                call_rate = None
                maf = None
                hweP = None
                
                
                if snpQcInfo is None and snpQcInfo_t is not None:
                    snpQcInfo = snpQcInfo_t
                elif snpQcInfo_t is not None:
                    snpQcInfo = pd.concat([snpQcInfo, snpQcInfo_t], axis=0, sort = False)
                ##First process SNPQc than check if si can continue.
                if len(snp_df.columns) == 0:
                    continue
                ##If we use bgen we replace the genotypes here to only have the dosage matrix in mem. Trying to save some memory.
                if (not plinkGenotype):
                    snp_df= snp_df_dosage.loc[:,np.unique(snp_df.columns)]
                    snp_df_dosage = None
                #We could make use of relatedness when imputing.  And impute only based on genetically unique individuals.
                snp_df = pd.DataFrame(fill_NaN.fit_transform(snp_df),index=snp_df.index,columns=snp_df.columns)
                ##No more snp_matrix_DF > snp_df
#                test if the covariates, kinship, snp and phenotype are in the same order
                if (len(snp_df.index) != len(sample2individual_feature.loc[phenotype_ds.index]['iid']) or not all(snp_df.index==sample2individual_feature.loc[phenotype_ds.index]['iid'])):
                    print ('There is an issue in mapping phenotypes and genotypes')
                    sys.exit()

                ##########################################################################################################################################################
                # SCANNING
                if debugger:
                    fun_start = time.time()
                
                rho = [None] * snp_df.shape[1]
                pVal = [None] * snp_df.shape[1]
                if pearson: 
                    for snpPos in range(0, snp_df.shape[1]) :
                        rho[snpPos], pVal[snpPos] = sp.stats.pearsonr(snp_df.values[:,snpPos], phenotype_ds.values)
                else: 
                    for snpPos in range(0, snp_df.shape[1]) :
                        rho[snpPos], pVal[snpPos] = sp.stats.spearmanr(snp_df.values[:,snpPos], phenotype_ds.values)
                
                if debugger:
                    fun_end = time.time()
                    print(" Actual scanning took {}".format(fun_end-fun_start))
                #########################################################################################################################################################
                
                #add these results to qtl_results
                temp_df = pd.DataFrame(index = range(len(snp_df.columns)),columns=['feature_id','snp_id','p_value','beta','beta_se','empirical_feature_p_value'])
                temp_df['snp_id'] = snp_df.columns
                temp_df['feature_id'] = feature_id.replace("/","-")
                temp_df['beta'] = np.asarray(rho)
                temp_df['p_value'] = np.asarray(pVal)
                
                #insert default dummy value
                temp_df['beta_se'] = None
                temp_df['empirical_feature_p_value'] = -1.0
                
                ##########################################################################################################################################################
                # SCANNING
                if debugger:
                    fun_start = time.time()
                if(n_perm!=0):
                    pValueBuffer = []
                    totalSnpsToBeTested = (snp_df.shape[1]*n_perm)
                    permutationStepSize = np.floor(n_perm/(totalSnpsToBeTested/blocksize))
                    if(permutationStepSize>n_perm):
                        permutationStepSize=n_perm
                    elif(permutationStepSize<1):
                        permutationStepSize=1
                    
                    if(write_permutations):
                        print("Not supported.")
                        #perm_df = pd.DataFrame(index = range(len(snp_df.columns)),columns=['snp_id'] + ['permutation_'+str(x) for x in range(n_perm)])
                        #perm_df['snp_id'] = snp_df.columns
                    for currentNperm in utils.chunker(list(range(1, n_perm+1)), permutationStepSize):
                        
                        if (kinship_df is not None) and (relatedness_score is not None):
                            temp = utils.get_shuffeld_genotypes_preserving_kinship(geneticaly_unique_individuals, relatedness_score, snp_df,kinship_df.loc[individual_ids,individual_ids], len(currentNperm))
                        else:
                            temp = utils.get_shuffeld_genotypes(snp_df, len(currentNperm))
                        temp = temp.astype(float)
                        
                        var_pvalues_p = [None] * temp.shape[1]
                        if pearson: 
                            for snpPos in range(0, temp.shape[1]) :
                                rhoP, var_pvalues_p[snpPos] = sp.stats.pearsonr(temp[:,snpPos], phenotype_ds.values)
                        else : 
                            for snpPos in range(0, temp.shape[1]) :
                                rhoP, var_pvalues_p[snpPos] = sp.stats.spearmanr(temp[:,snpPos], phenotype_ds.values)
                        pValueBuffer.extend(np.asarray(var_pvalues_p))
                    if(not(len(pValueBuffer)==totalSnpsToBeTested)):
                        print(len(pValueBuffer))
                        print(pValueBuffer)
                        print(totalSnpsToBeTested)
                        print('Error in blocking logic for permutations.')
                        sys.exit()
                    perm = 0
                    for relevantOutput in utils.chunker(pValueBuffer,snp_df.shape[1]) :
                        #if(write_permutations):
                        #    perm_df['permutation_'+str(perm)] = relevantOutput
                        if(bestPermutationPval[perm] > min(relevantOutput)):
                            bestPermutationPval[perm] = min(relevantOutput)
                        perm = perm+1
                        #print(relevantOutput)
                        #print('permutation_'+str(perm))

                if not temp_df.empty :
                    data_written = True
                    output_writer.add_result_df(temp_df)
                    #if(write_permutations):
                    #    permutation_writer.add_permutation_results_df(perm_df,feature_id)
                if debugger:
                    fun_end = time.time()
                    print(" Permutations took {}".format(fun_end-fun_start))
        #This we need to change in the written file.
        if debugger:
            fun_start = time.time()
        
        if not data_written :
            fail_qc_features.append(feature_id)
        else:
            n_samples.append(phenotype_ds.size)
            n_e_samples.append(len(geneticaly_unique_individuals))
            if n_perm>1 :
                #updated_permuted_p_in_hdf5(bestPermutationPval, feature_id);
                alpha_para, beta_para = output_writer.apply_pval_correction(feature_id.replace("/","-"),bestPermutationPval, cis_mode)
                if write_feature_top_permutations:
                    np.savetxt(output_dir+"/Permutation.pValues."+feature_id.replace("/","-")+".txt",bestPermutationPval)
                alpha_params.append(alpha_para)
                beta_params.append(beta_para)
            if randomeff_mix :
                random_eff_param.append(feature_best_rho)
        
        if contains_missing_samples:
            QS = None
            Sigma_qs = None
            geneticaly_unique_individuals = tmp_unique_individuals
            del tmp_unique_individuals
            if snpQcInfo is not None:
                snpQcInfo.index.name = "snp_id"
                snpQcInfo.to_csv(output_dir+'/snp_qc_metrics_naContaining_feature_{}.txt'.format(feature_id.replace("/","-")),sep='\t')
        else:
            if (snpQcInfo is not None and snpQcInfoMain is not None):
                snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo], axis=0, sort=False)
            elif snpQcInfo is not None :
                snpQcInfoMain = snpQcInfo.copy(deep=True)
        if debugger:
            fun_end = time.time()
            print(" Writing took {}".format(fun_end-fun_start))
        #if snpQcInfo is not None:
            #snpQcInfo2 = snpQcInfo.copy().transpose()
            #snpQcInfo2.to_csv(output_dir+'/snp_qc_metrics_feature_{}.txt'.format(feature_id),sep='\t')
        #print('step 5')
            
        print("Time: --- %s seconds ---" % (time.time() - start_time))
        tot_time += time.time() - start_time
        idx += 1
        print("Mean: --- %s seconds ---" % (tot_time/idx))
        log[(feature_id)].append((time.time() - start_time))
        log[(feature_id)].append(tot_time/idx)

    output_writer.close()

    if(write_permutations):
        permutation_writer.close()
    fail_qc_features = np.unique(fail_qc_features)
    if((len(feature_list)-len(fail_qc_features))==0):
        time.sleep(15)
        #Safety timer to make sure the file is unlocked.
        print("Trying to remove the h5 file. Nothing has been tested.")
        print(output_dir+'qtl_results_{}_{}_{}.h5'.format(chromosome,selectionStart,selectionEnd))
        if not selectionStart is None :
            os.remove(output_dir+'qtl_results_{}_{}_{}.h5'.format(chromosome,selectionStart,selectionEnd))
        else :
            os.remove(output_dir+'qtl_results_{}.h5'.format(chromosome))
        sys.exit()
    #gather unique indexes of tested SNPs

    tested_snp_ids = list(set(tested_snp_ids))
    #write annotation and snp data to file
    snp_df = pd.DataFrame()
    snp_df['snp_id'] = bim['snp']
    snp_df['chromosome'] = bim['chrom']
    snp_df['position'] = bim['pos']
    snp_df['assessed_allele'] = bim['a1']
    snp_df.index = snp_df['snp_id']
    snp_df = snp_df.drop_duplicates()
    snp_df = snp_df.reindex(tested_snp_ids)
    snp_df = snp_df.drop_duplicates()

    if snpQcInfoMain is not None :
        snpQcInfoMain['index']=snpQcInfoMain.index
        snpQcInfoMain = snpQcInfoMain.drop_duplicates()
        del snpQcInfoMain['index']
        snp_df = pd.concat([snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1)

        if(snp_df.shape[1]==5):
            snp_df.columns = ['snp_id','chromosome','position','assessed_allele','call_rate']
        elif(snp_df.shape[1]==6):
            snp_df.columns = ['snp_id','chromosome','position','assessed_allele','call_rate','maf']
        else :
            snp_df.columns = ['snp_id','chromosome','position','assessed_allele','call_rate','maf','hwe_p']

    feature_list = list(set(feature_list) - set(fail_qc_features))
    annotation_df = annotation_df.reindex(feature_list)
    annotation_df['n_samples'] = n_samples
    annotation_df['n_e_samples'] = n_e_samples

    if(n_perm>1):
        annotation_df['alpha_param'] = alpha_params
        annotation_df['beta_param'] = beta_params
    
    if randomeff_mix:
        annotation_df['rho'] = random_eff_param

    if not selectionStart is None :
        snp_df.to_csv(output_dir+'/snp_metadata_{}_{}_{}.txt'.format(chromosome,selectionStart,selectionEnd),sep='\t',index=False)
        annotation_df.to_csv(output_dir+'/feature_metadata_{}_{}_{}.txt'.format(chromosome,selectionStart,selectionEnd),sep='\t')
    else :
        snp_df.to_csv(output_dir+'/snp_metadata_{}.txt'.format(chromosome),sep='\t',index=False)
        annotation_df.to_csv(output_dir+'/feature_metadata_{}.txt'.format(chromosome),sep='\t')

    if not selectionStart is None :
        print("saving log!")
        print(log)
        #pd.DataFrame.from_dict(log, orient="index", columns=["rho","start","mean"]).to_csv(output_dir + "/" + str(chromosome) + "_" + str(selectionStart) + "_" +  str(selectionEnd) + "_log_rho.txt", sep="\t")
    else:
        print("saving log!")