Example #1
0
def study_type(study):
    """
    Summarises the cell types of a study as one '; '-joined string of names.

    For every experiment of the study, the experiment's cell types and all
    of their ontology ancestors (co.get_ancestors) are collected. With more
    than one experiment the per-experiment term lists are merged by set
    union. The merged terms are reduced with co.get_terms_without_children
    and the names of the remaining terms are joined with '; '.

    Returns an empty string when the study has no experiments.
    """
    exps = v5.study_to_exps(study)
    if len(exps) == 0:
        # Without this guard `ancestors` would never be assigned and the
        # function would fail with an UnboundLocalError further down.
        return ''

    ancestor_dict = {}
    for exp in exps:
        celltypes = v5.exp_to_celltypes(exp)
        # Extend a copy so the list handed out by v5 is never grown in place.
        ancestors = list(celltypes)
        for celltype in celltypes:
            ancestors += co.get_ancestors(celltype)
        ancestor_dict[exp] = ancestors

    if len(exps) == 1:
        ancestors = ancestor_dict[exps[0]]
    else:
        # NOTE(review): this is a union over experiments, although the result
        # is called `common` below -- confirm an intersection is not intended.
        merged = set()
        for exp_terms in ancestor_dict.values():
            merged |= set(exp_terms)
        ancestors = list(merged)

    common = [
        co.get_term_name(term_id)
        for term_id in co.get_terms_without_children(list(ancestors))
    ]
    return '; '.join(common)
def exp_to_celltypes(exp):
    """
    Gets specific labels for a given experiment.

    The experiment's entry in the module-level `type_file` is reduced with
    co.get_terms_without_children, then a fixed list of very general terms
    is filtered out. If nothing survives the filter, the unfiltered terms
    are returned instead.

    NOTE(review): `type_file` is subscripted here, so it presumably is a
    mapping from experiment to term ids -- confirm where it is defined.
    """
    # PBMC, blood cell, somatic cell, animal cell, cultured cell:
    # deliberately excluded as too general to be informative.
    too_general = (
        "CL:2000001", "CL:0000081", "CL:0002371", "CL:0000548", "CL:0000010"
    )
    leaf_terms = co.get_terms_without_children(type_file[exp])
    informative = [term for term in leaf_terms if term not in too_general]
    if informative:
        return informative
    # Only general labels were present: fall back to the original labels.
    return leaf_terms
def create_reference(index_list):
    """
    Creates a reference matrix from a selection of experiments.

    index_list holds positions into the module-level `exp_acc`. The cell
    types of those experiments are pooled and reduced with
    co.get_terms_without_children; each remaining term ("leaf") contributes
    one column to the reference: the signature of its single experiment, or
    the mean over axis 1 of get_signatures' result when the leaf has several
    experiments.

    Returns a dict with keys "gene_ids" (the module-level gene_ids),
    "reference" (the matrix) and "cell_types" (leaf names, in column order).

    Raises ValueError if no leaf cell type is found.
    """
    cell_types = []
    for i in index_list:
        cell_types += exp_to_celltypes(exp_acc[i])  # TEST
    # Debug trace: reports whether CL:0000236 is among the pooled cell types.
    if 'CL:0000236' in cell_types:
        print("Yes!")
    else:
        print("No!")

    cell_types = list(set(cell_types))
    leaves = co.get_terms_without_children(cell_types)

    # Debug trace: reports whether CL:0000236 survived the leaf reduction.
    if 'CL:0000236' in leaves:
        print("Yes! Two")
    else:
        print("No! Two")

    if len(leaves) == 0:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            "create_reference: no leaf cell types found for the given indices")

    # Experiment indices that carry each leaf cell type.
    leaf_index = {}
    for leaf in leaves:
        leaf_index[leaf] = [
            exp_to_index(exp) for exp in celltype_to_exp(leaf)
        ]  # TEST

    # One column per leaf, collected first and stacked once at the end.
    columns = []
    for leaf in leaves:
        signatures = get_signatures(leaf_index[leaf])
        if len(leaf_index[leaf]) == 1:
            columns.append(signatures)
        else:
            columns.append(np.mean(signatures, axis=1))

    if len(columns) == 1:
        # A single leaf is returned exactly as get_signatures shaped it.
        a = columns[0]
    else:
        # column_stack turns 1-D inputs into columns and keeps 2-D inputs
        # as they are, so one call equals stacking pair by pair.
        a = np.column_stack(columns)

    leaves = [co.get_term_name(leaf) for leaf in leaves]

    return {"gene_ids": gene_ids, "reference": a, "cell_types": leaves}
def multi_core(cell_exp_count, studies, exp_acc, gene_ids, countspermillion,
               qualified_cell_type_name, cell_type_file, qualified_cell_type):
    """
    Builds and saves a noisy signature, a noise-free signature and a
    per-cell-type variance matrix.

    1. For every cell type in qualified_cell_type one experiment is drawn at
       random (V5.celltype_to_exp); its row of countspermillion becomes the
       cell type's row of the noisy signature and its study is remembered.
    2. The noise-free row of a cell type is the mean of the countspermillion
       rows of its experiments whose study differs from the remembered one.
    3. Both signatures are written as TSV files under
       ~/IndependentStudy/Data/SignatureSimulation/.
    4. For every cell type the variance of the per-study mean profiles plus
       the sum of the within-study variances is computed over the studies
       other than the remembered one and written, one row per cell type, to
       ~/IndependentStudy/Data/Variance/.

    Args:
        cell_exp_count: used only as the prefix of the output file names.
        studies: study of each experiment, parallel to exp_acc.
        exp_acc: experiment accessions; an accession's position is used as
            the row index into countspermillion.
        gene_ids: column labels of the two signature tables.
        countspermillion: expression values, indexed by experiment position;
            also handed to get_signatures.
        qualified_cell_type_name: row labels of the noise-free signature.
        cell_type_file: mapping from experiment to its cell-type term ids.
        qualified_cell_type: cell-type term ids to process.

    Raises:
        ValueError: if a cell type has no experiment outside the study
            selected for its noisy profile, or if qualified_cell_type is
            empty (raised by np.vstack).
    """
    exp_acc_list = list(exp_acc)
    cell_types_selected = qualified_cell_type

    # Noisy reference: one randomly selected experiment per cell type.
    select_study_list = {}
    noisy_rows = []
    for cell_type in qualified_cell_type:
        select_sample = random.choice(V5.celltype_to_exp(cell_type))
        select_study_list[cell_type] = V5.exp_to_study(select_sample)
        noisy_rows.append(exp_acc_list.index(select_sample))

    # Stack once; the result is 2-D even for a single cell type.
    signature_noise_np = np.vstack(
        [countspermillion[row] for row in noisy_rows])

    # Noise-free reference: mean over the experiments from all other studies.
    reference_noise_free = []
    for cell_type in qualified_cell_type:
        other_rows = [
            exp_acc_list.index(exp)
            for exp in V5.celltype_to_exp(cell_type)
            if V5.exp_to_study(exp) != select_study_list[cell_type]
        ]
        if len(other_rows) == 0:
            # Previously the matrix left over from an earlier step was
            # silently reused here, producing a wrong profile.
            raise ValueError(
                "multi_core: cell type " + str(cell_type) +
                " has no experiment outside its selected study")
        if len(other_rows) > 1:
            ref_mean = np.mean(
                np.vstack([countspermillion[row] for row in other_rows]),
                axis=0)
        else:
            ref_mean = countspermillion[other_rows[0]]
        reference_noise_free.append(ref_mean)

    signature_np = np.array(reference_noise_free)

    # Transform to pandas (rows: cell types, columns: genes).
    signature_pd = pd.DataFrame(data=signature_np,
                                columns=gene_ids,
                                index=qualified_cell_type_name)
    signature_noise_np_pd = pd.DataFrame(
        data=signature_noise_np,
        columns=gene_ids,
        index=[co.get_term_name(i) for i in qualified_cell_type])

    # Save the signature and noisy signature for future analysis.
    signature_pd.to_csv('~/IndependentStudy/Data/SignatureSimulation/' +
                        str(cell_exp_count) + '_signature.tsv',
                        sep='\t')
    signature_noise_np_pd.to_csv(
        '~/IndependentStudy/Data/SignatureSimulation/' + str(cell_exp_count) +
        '_signature_noise.tsv',
        sep='\t')

    # Keep only the terms without children for every experiment.
    cell_type_specific_file = {}
    for exp in cell_type_file:
        cell_type_specific_file[exp] = co.get_terms_without_children(
            cell_type_file[exp])

    # Experiment -> study lookup.
    expstudyMap = {}
    for pos, acc in enumerate(exp_acc):
        expstudyMap[acc] = studies[pos]

    # Build the variance matrix, one row per selected cell type.
    variance_matrix = []
    for cell_type in cell_types_selected:
        # Group the experiments annotated with this cell type by study.
        tmp_exp_study = {}
        for exp in cell_type_specific_file:
            if cell_type in cell_type_specific_file[exp]:
                tmp_exp_study.setdefault(expstudyMap[exp], []).append(exp)

        tmp_mean = []  # per-study mean profiles
        within_study_var = []  # per-study within-study variances
        for study, study_exps in tmp_exp_study.items():
            # NOTE(review): `in` is a containment test; if V5.exp_to_study
            # returns a plain string this is a substring match, whereas the
            # noise-free reference above compares with `!=` -- confirm.
            if study in select_study_list[cell_type]:
                continue

            members = set(study_exps)
            specific_cell_exp_index = [
                pos for pos, acc in enumerate(exp_acc) if acc in members
            ]
            specific_cell_exp_signature = get_signatures(
                specific_cell_exp_index, countspermillion)

            if len(study_exps) == 1:
                # A single experiment is its own mean and has no spread.
                tmp_mean.append(specific_cell_exp_signature)
                within_study_var.append(
                    np.zeros(specific_cell_exp_signature.shape[0]))
            else:
                study_mean = np.mean(specific_cell_exp_signature, axis=1)
                tmp_mean.append(study_mean)
                # Absolute deviation of every experiment from the study mean.
                tmp_residue_list = np.array([
                    np.abs(
                        get_signatures([index], countspermillion) -
                        study_mean) for index in specific_cell_exp_index
                ])
                within_study_var.append(np.var(tmp_residue_list, axis=0))

        within_study_var = np.array(within_study_var)

        # Between-study variance of the per-study means.
        tmp_mean = np.array(tmp_mean)
        study_variance = np.var(tmp_mean, axis=0)

        # We assume the variance sum law here.
        total_variance = np.zeros(study_variance.shape[0])
        total_variance = total_variance + study_variance
        for study_var in within_study_var:
            total_variance = total_variance + study_var

        variance_matrix.append(total_variance)

    variance_matrix = np.array(variance_matrix)
    print(variance_matrix.shape)
    # np.savetxt creates the file itself, so no shell `touch` is needed;
    # expanduser resolves `~`, matching the location of the other outputs.
    np.savetxt(os.path.expanduser('~/IndependentStudy/Data/Variance/' +
                                  str(cell_exp_count) + '_variance.txt'),
               variance_matrix,
               delimiter="\t")
def build_model():
    """
    Fits a linear spline of log(std) against log(mean CPM + 1).

    Experiments whose only term without children is 'CL:2000001' are taken
    from cell_types.json, one experiment per study (the first one seen).
    Their signatures come from get_signatures; mean and standard deviation
    are taken along axis 1 of that result. Entries with a zero log-mean or a
    non-finite log-std are dropped, a scatter plot of the remaining points
    is drawn, and for every distinct x one of its y values is picked at
    random before fitting.

    Reads expression_CPM.h5 and cell_types.json from the working directory.

    Returns:
        scipy.interpolate.UnivariateSpline fitted with k=1.
    """
    # Only the study and accession vectors are needed in this function.
    with h5py.File("expression_CPM.h5", 'r') as cpm:
        studies = np.array(cpm.get('study')).astype(str)
        exp_acc = np.array(cpm.get('experiment_accession')).astype(str)

    with open('cell_types.json', 'r') as type_file:
        cell_type_file = json.load(type_file)

    # Experiments whose only term without children is CL:2000001.
    selected_exps = [
        exp for exp in cell_type_file
        if co.get_terms_without_children(cell_type_file[exp]) ==
        ['CL:2000001']
    ]

    # Experiment -> study lookup.
    expstudyMap = {}
    for pos, acc in enumerate(exp_acc):
        expstudyMap[acc] = studies[pos]

    # Keep one experiment per study: the first one encountered.
    seen_studies = set()
    specific_cell_exp = set()
    for exp in selected_exps:
        study = expstudyMap[exp]
        if study not in seen_studies:
            seen_studies.add(study)
            specific_cell_exp.add(exp)

    # Positions of the kept experiments in exp_acc.
    specific_cell_exp_index = [
        pos for pos, acc in enumerate(exp_acc) if acc in specific_cell_exp
    ]
    specific_cell_exp_signature = get_signatures(specific_cell_exp_index)

    x = np.log(np.mean(specific_cell_exp_signature, axis=1) + 1)
    y = np.log(np.std(specific_cell_exp_signature, axis=1))

    # Drop entries with zero log-mean, and entries whose log-std is NaN or
    # infinite (std == 0 gives -inf), which a spline fit cannot handle.
    drop = [
        i for i in range(len(y)) if x[i] == 0 or not np.isfinite(y[i])
    ]
    x = np.delete(x, drop, axis=0)
    y = np.delete(y, drop, axis=0)

    _, ax1 = plt.subplots()
    ax1.scatter(x, y, alpha=0.6)
    ax1.set_title("Std-Mean Plot in Log Space")
    ax1.set_ylabel("log(STD)")
    ax1.set_xlabel("log(MeanCPM+1)")

    # Record every y observed for each distinct x.
    xyDict = {}
    for x_val, y_val in zip(x, y):
        xyDict.setdefault(x_val, []).append(y_val)

    # Pick one y uniformly at random per distinct x. random.choice replaces
    # `randint(0, n) - 1`, which could yield -1 and so favoured the last
    # element.
    interpolatex = np.array(sorted(xyDict))
    interpolatey = np.array(
        [random.choice(xyDict[x_val]) for x_val in interpolatex])
    sp1 = UnivariateSpline(interpolatex, interpolatey, k=1)

    return sp1
def cell_type_variance_model():
    """
    Fits a degree-5 spline of log(sqrt(total variance) + 1) against
    log(mean expression + 1), pooled over a fixed list of cell types.

    For every cell type the experiments carrying it (per cell_types.json,
    after co.get_terms_without_children) are grouped by study. The cell
    type's variance is the variance of the per-study mean profiles plus the
    sum of the within-study variances. The variances are summed over cell
    types, entries with a zero x are dropped, and for every distinct x one
    of its y values is picked at random before fitting.

    Reads expression_CPM.h5 and cell_types.json from the working directory.

    Returns:
        scipy.interpolate.UnivariateSpline fitted with k=5.
    """
    # Only the study and accession vectors are needed in this function.
    with h5py.File("expression_CPM.h5", 'r') as cpm:
        studies = np.array(cpm.get('study')).astype(str)
        exp_acc = np.array(cpm.get('experiment_accession')).astype(str)

    with open('cell_types.json', 'r') as type_file:
        cell_type_file = json.load(type_file)

    # Keep only the terms without children for every experiment.
    cell_type_specific_file = {}
    for exp in cell_type_file:
        cell_type_specific_file[exp] = co.get_terms_without_children(
            cell_type_file[exp])

    # Experiment -> study lookup.
    expstudyMap = {}
    for pos, acc in enumerate(exp_acc):
        expstudyMap[acc] = studies[pos]

    cell_types_selected = [
        'CL:1000274', 'CL:0002618', 'CL:0000501', 'CL:0000765', 'CL:2000001',
        'CL:0002341', 'CL:0000583', 'CL:0000127', 'CL:0002631', 'CL:0000936',
        'CL:0002327', 'CL:0000023', 'CL:0000216', 'CL:0000557', 'CL:0000018',
        'CL:0000905', 'CL:0000182', 'CL:0000895', 'CL:0000096', 'CL:0002340',
        'CL:0011001', 'CL:0000050', 'CL:0002633', 'CL:0000232', 'CL:0000019',
        'CL:0000792', 'CL:0002063', 'CL:0000836', 'CL:0000904', 'CL:0002399',
        'CL:0000233', 'CL:0002038', 'CL:0000788', 'CL:0000900', 'CL:0000670',
        'CL:0002057', 'CL:0000351', 'CL:0001069', 'CL:0000091', 'CL:0000359',
        'CL:0010004', 'CL:0000171', 'CL:0000169', 'CL:0000017', 'CL:0000623',
        'CL:0001057', 'CL:0002394', 'CL:0000129'
    ]

    variance_matrix = []
    cell_types_48 = []  # per-study mean profiles of all cell types, pooled
    for cell_co in cell_types_selected:
        # Group the experiments annotated with this cell type by study.
        tmp_exp_study = {}
        for exp in cell_type_specific_file:
            if cell_co in cell_type_specific_file[exp]:
                tmp_exp_study.setdefault(expstudyMap[exp], []).append(exp)

        tmp_mean = []  # per-study mean profiles
        within_study_var = []  # per-study within-study variances
        for study_exps in tmp_exp_study.values():
            members = set(study_exps)
            specific_cell_exp_index = [
                pos for pos, acc in enumerate(exp_acc) if acc in members
            ]
            specific_cell_exp_signature = get_signatures(
                specific_cell_exp_index)

            if len(study_exps) == 1:
                # A single experiment is its own mean and has no spread.
                tmp_mean.append(specific_cell_exp_signature)
                within_study_var.append(
                    np.zeros(specific_cell_exp_signature.shape[0]))
            else:
                study_mean = np.mean(specific_cell_exp_signature, axis=1)
                tmp_mean.append(study_mean)
                # Absolute deviation of every experiment from the study mean.
                tmp_residue_list = np.array([
                    np.abs(get_signatures([index]) - study_mean)
                    for index in specific_cell_exp_index
                ])
                within_study_var.append(np.var(tmp_residue_list, axis=0))

        cell_types_48 += tmp_mean
        within_study_var = np.array(within_study_var)

        # Between-study variance of the per-study means.
        tmp_mean = np.array(tmp_mean)
        study_variance = np.var(tmp_mean, axis=0)

        # We assume the variance sum law here.
        total_variance = np.zeros(study_variance.shape[0])
        total_variance = total_variance + study_variance
        for study_var in within_study_var:
            total_variance = total_variance + study_var

        variance_matrix.append(total_variance)

    variance_matrix = np.array(variance_matrix)
    variance_matrix_sum = np.sum(variance_matrix, axis=0)

    x = np.log(np.mean(cell_types_48, axis=0) + 1)
    y = np.log(np.sqrt(variance_matrix_sum) + 1)

    # Drop the entries whose log-mean is exactly zero.
    drop = [i for i in range(len(y)) if x[i] == 0]
    x = np.delete(x, drop, axis=0)
    y = np.delete(y, drop, axis=0)

    # Record every y observed for each distinct x.
    xyDict = {}
    for x_val, y_val in zip(x, y):
        xyDict.setdefault(x_val, []).append(y_val)

    # Pick one y uniformly at random per distinct x. random.choice replaces
    # `randint(0, n) - 1`, which could yield -1 and so favoured the last
    # element.
    interpolatex = np.array(sorted(xyDict))
    interpolatey = np.array(
        [random.choice(xyDict[x_val]) for x_val in interpolatex])
    sp1 = UnivariateSpline(interpolatex, interpolatey, k=5)

    return sp1
def cell_type_variance():
    """
    Computes a variance profile for each of a fixed list of cell types.

    For every cell type the experiments carrying it (per cell_types.json,
    after co.get_terms_without_children) are grouped by study. The cell
    type's row is the variance of the per-study mean profiles plus the sum
    of the within-study variances; a study with a single experiment
    contributes a within-study variance of zero.

    Reads expression_CPM.h5 and cell_types.json from the working directory.

    Returns:
        numpy array with one row per entry of the cell-type list, in list
        order.
    """
    # Only the study and accession vectors are needed in this function.
    with h5py.File("expression_CPM.h5", 'r') as cpm:
        studies = np.array(cpm.get('study')).astype(str)
        exp_acc = np.array(cpm.get('experiment_accession')).astype(str)

    with open('cell_types.json', 'r') as type_file:
        cell_type_file = json.load(type_file)

    # Keep only the terms without children for every experiment.
    cell_type_specific_file = {}
    for exp in cell_type_file:
        cell_type_specific_file[exp] = co.get_terms_without_children(
            cell_type_file[exp])

    # Experiment -> study lookup.
    expstudyMap = {}
    for pos, acc in enumerate(exp_acc):
        expstudyMap[acc] = studies[pos]

    cell_types_selected = [
        'CL:1000274', 'CL:0002618', 'CL:0000501', 'CL:0000765', 'CL:2000001',
        'CL:0002341', 'CL:0000583', 'CL:0000127', 'CL:0002631', 'CL:0000936',
        'CL:0002327', 'CL:0000023', 'CL:0000216', 'CL:0000557', 'CL:0000018',
        'CL:0000905', 'CL:0000182', 'CL:0000895', 'CL:0000096', 'CL:0002340',
        'CL:0011001', 'CL:0000050', 'CL:0002633', 'CL:0000232', 'CL:0000019',
        'CL:0000792', 'CL:0002063', 'CL:0000836', 'CL:0000904', 'CL:0002399',
        'CL:0000233', 'CL:0002038', 'CL:0000788', 'CL:0000900', 'CL:0000670',
        'CL:0002057', 'CL:0000351', 'CL:0001069', 'CL:0000091', 'CL:0000359',
        'CL:0010004', 'CL:0000171', 'CL:0000169', 'CL:0000017', 'CL:0000623',
        'CL:0001057', 'CL:0002394', 'CL:0000129'
    ]

    variance_matrix = []
    for cell_co in cell_types_selected:
        # Group the experiments annotated with this cell type by study.
        tmp_exp_study = {}
        for exp in cell_type_specific_file:
            if cell_co in cell_type_specific_file[exp]:
                tmp_exp_study.setdefault(expstudyMap[exp], []).append(exp)

        tmp_mean = []  # per-study mean profiles
        within_study_var = []  # per-study within-study variances
        for study_exps in tmp_exp_study.values():
            members = set(study_exps)
            specific_cell_exp_index = [
                pos for pos, acc in enumerate(exp_acc) if acc in members
            ]
            specific_cell_exp_signature = get_signatures(
                specific_cell_exp_index)

            if len(study_exps) == 1:
                # A single experiment is its own mean and has no spread.
                tmp_mean.append(specific_cell_exp_signature)
                within_study_var.append(
                    np.zeros(specific_cell_exp_signature.shape[0]))
            else:
                study_mean = np.mean(specific_cell_exp_signature, axis=1)
                tmp_mean.append(study_mean)
                # Absolute deviation of every experiment from the study mean.
                tmp_residue_list = np.array([
                    np.abs(get_signatures([index]) - study_mean)
                    for index in specific_cell_exp_index
                ])
                within_study_var.append(np.var(tmp_residue_list, axis=0))

        within_study_var = np.array(within_study_var)

        # Between-study variance of the per-study means.
        tmp_mean = np.array(tmp_mean)
        study_variance = np.var(tmp_mean, axis=0)

        # We assume the variance sum law here.
        total_variance = np.zeros(study_variance.shape[0])
        total_variance = total_variance + study_variance
        for study_var in within_study_var:
            total_variance = total_variance + study_var

        variance_matrix.append(total_variance)

    return np.array(variance_matrix)
# Load h5 data. The context manager closes the file even when one of the
# dataset reads raises; `cpm` stays bound to the (closed) handle afterwards.
with h5py.File("expression_CPM.h5", 'r') as cpm:
    studies = np.array(cpm.get('study')).astype(str)
    exp_acc = np.array(cpm.get('experiment_accession')).astype(str)
    gene_ids = np.array(cpm.get('gene')).astype(str)
    countspermillion = np.array(cpm.get('cpm'))

# Experiment -> cell-type term ids, as stored in cell_types.json.
with open('cell_types.json', 'r') as type_file:
    cell_type_file = json.load(type_file)

# Keep only the terms without children for every experiment.
cell_type_specific_file = {}
for i in cell_type_file:
    cell_type_specific_file[i] = co.get_terms_without_children(
        cell_type_file[i])

# Experiments whose only remaining term is CL:2000001.
cellExpDict = {}
for i in cell_type_specific_file:
    if cell_type_specific_file[i] == ['CL:2000001']:
        cellExpDict[i] = ['CL:2000001']

# Lookups in both directions: study -> experiments and experiment -> study.
studyexpMap = {}
expstudyMap = {}
for i in range(len(exp_acc)):
    expstudyMap[exp_acc[i]] = studies[i]
    studyexpMap.setdefault(studies[i], []).append(exp_acc[i])