from util import Timer
'''
This script is the baseline that we later compare to.
It loops over genes and performs score-tests, using pre-computed indicator variables (0/1)
'''

# NOTE(review): `np`, `CovariatesLoaderCSV`, `BurdenLoaderHDF5` and `snakemake`
# are not imported/defined in the visible part of this file -- confirm they
# are in scope before this point.

# covariates loader, configured entirely from the `snakemake` params/inputs;
# the covariates file is parsed as tab-separated
covariatesloader = CovariatesLoaderCSV(
    snakemake.params.phenotype,
    snakemake.input.covariates_tsv,
    snakemake.params.covariate_column_names,
    sep='\t',
    path_to_phenotypes=snakemake.input.phenotypes_tsv,
)

# set up burden loaders: one for the `*_lof` inputs, one for the `*_missense`
# inputs (each takes an HDF5 file plus the matching iid and gid inputs)
bloader_lof = BurdenLoaderHDF5(
    snakemake.input.h5_lof,
    snakemake.input.iid_lof,
    snakemake.input.gid_lof,
)
bloader_missense = BurdenLoaderHDF5(
    snakemake.input.h5_missense,
    snakemake.input.iid_missense,
    snakemake.input.gid_missense,
)

# make sure individuals are in the same order: both burden loaders are
# updated with the individual ids reported by the covariates loader
bloader_lof.update_individuals(covariatesloader.get_iids())
bloader_missense.update_individuals(covariatesloader.get_iids())

# gene names to iterate over: every id present in at least one burden loader.
# np.union1d always returns a sorted, de-duplicated 1-D ndarray, so the result
# can never be a bare `str` and needs no wrapping into a list.
genes = np.union1d(bloader_lof.get_vids(), bloader_missense.get_vids())

# set up the null model: phenotype (Y) and one-hot encoded covariates (X)
# NOTE(review): the meaning of the 'NoK' argument is defined by
# CovariatesLoaderCSV and is not visible here -- confirm.
Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('NoK')
    regions.columns = ['chrom', 'start', 'end', 'name']

    # discard all genes for which we don't have annotations
    regions['gene'] = regions.name.str.split('_', expand=True)[0]
    regions.set_index('gene', inplace=True)
    genes = intersect_ids(np.unique(regions.index.values), np.unique(eveploader.pos_df.gene))
    regions = regions.loc[genes].reset_index()
    regions = regions.sort_values(['chrom','start','end'])[['chrom','start','end','name','gene']]

    # set up the variant loader (missense variants) for the chromosome
    plinkloader = VariantLoaderSnpReader(Bed(bed, count_A1=True, num_threads=4))
    plinkloader.update_variants(eveploader.get_vids())
    plinkloader.update_individuals(covariatesloader.get_iids())
    
    # set up the protein LOF burden loader
    bloader_lof = BurdenLoaderHDF5(h5_lof, iid_lof, gid_lof)
    bloader_lof.update_individuals(covariatesloader.get_iids())

    # set up local collapsing
    collapser = LocalCollapsing(distance_threshold=1.)

    # set up the missense genotype + vep loading function
    def get_missense(interval):

        try:
            V1 = eveploader.anno_by_interval(interval, gene=interval['name'].split('_')[0])
        except KeyError:
            raise GotNone

        if V1.index.empty:
            raise GotNone