def expression_correlation(gene_types, expression_data):
    """Compute pairwise Pearson correlations between expression columns.

    Args:
        gene_types: iterable of data-source dicts; each must have an ``'id'``
            key naming a column of ``expression_data``.
        expression_data: table exposing ``.columns`` and column access by id
            (presumably a pandas DataFrame -- confirm against the caller).

    Returns:
        A list of the objects produced by ``build_obj`` for every pair of
        data sources whose columns are both present, sorted by their
        ``'value'`` entry in descending order. Empty when no pair qualifies.
    """
    corr_list = []
    for data_source_one, data_source_two in itertools.combinations(
            gene_types, 2):
        exp1 = data_source_one['id']
        exp2 = data_source_two['id']
        # Pairs without data for both measurements are skipped, not errors.
        if (exp1 not in expression_data.columns
                or exp2 not in expression_data.columns):
            continue

        correlation_coefficient = pearsonr(expression_data[exp1],
                                           expression_data[exp2])
        corr_list.append(
            build_obj('correlation', 'expression', 'expression', True,
                      data_source_one, data_source_two,
                      correlation_coefficient[0],
                      correlation_coefficient[1]))

    # The previous version ended with a bare ``corr_list`` expression, so the
    # function always returned None; return the sorted result explicitly.
    return sorted(corr_list, key=lambda x: x['value'], reverse=True)
def methy_correlation(methy_raw, methylation_diff_types):
    """Correlate methylation columns that share the same id prefix.

    Every pair of entries in ``methylation_diff_types`` is considered. A pair
    is used only when the text before the first ``"_"`` of both ids matches
    and both ids are columns of ``methy_raw``. For each such pair the Pearson
    statistics and the min/max of both columns are handed to ``build_obj``.

    Returns the collected objects sorted by ``'value'`` (descending), or an
    empty list when ``methy_raw`` is empty.
    """
    results = []
    if methy_raw.empty:
        return results

    for first, second in itertools.combinations(methylation_diff_types, 2):
        id_one = first["id"]
        id_two = second["id"]

        same_prefix = id_one.split("_")[0] == id_two.split("_")[0]
        if not (same_prefix and id_one in methy_raw.columns
                and id_two in methy_raw.columns):
            continue

        series_one = methy_raw[id_one]
        series_two = methy_raw[id_two]
        stats = pearsonr(series_one, series_two)
        print(id_one, id_two)

        value_ranges = {
            'attr-one': [min(series_one), max(series_one)],
            'attr-two': [min(series_two), max(series_two)]
        }
        results.append(
            build_obj('correlation', 'methylation', 'methylation', True,
                      first, second, stats[0], stats[1],
                      ranges=value_ranges))

    results.sort(key=lambda entry: entry['value'], reverse=True)
    return results
def methy_diff_correlation(methy_diff_data, methylation_diff_types):
    """Correlate every pair of methylation-difference columns.

    Each 2-combination of ``methylation_diff_types`` whose ids are both
    columns of ``methy_diff_data`` yields one ``build_obj`` result carrying
    the Pearson statistics and the min/max of each column.

    Returns the results sorted by ``'value'`` in descending order; an empty
    list when ``methy_diff_data`` is empty.
    """
    if methy_diff_data.empty:
        return []

    collected = []
    for source_a, source_b in itertools.combinations(
            methylation_diff_types, 2):
        key_a = source_a["id"]
        key_b = source_b["id"]

        # Only pairs with data for both methylation types are correlated.
        if not (key_a in methy_diff_data.columns
                and key_b in methy_diff_data.columns):
            continue

        values_a = methy_diff_data[key_a]
        values_b = methy_diff_data[key_b]
        pearson = pearsonr(values_a, values_b)
        spans = {
            'attr-one': [min(values_a), max(values_a)],
            'attr-two': [min(values_b), max(values_b)]
        }
        collected.append(
            build_obj('correlation', 'methylation diff', 'methylation diff',
                      True, source_a, source_b, pearson[0], pearson[1],
                      ranges=spans))

    collected.sort(key=lambda item: item['value'], reverse=True)
    return collected
def comp_req(start_seq, end_seq, chromosome, gene_name, measurements=None):
    """Run every applicable analysis for a genomic region, yielding each result.

    Measurements are grouped by ``defaultChartType``: ``"scatterplot"``
    entries are treated as gene expression, ``"block"`` entries as blocks,
    and ``"line"`` entries as methylation (``datasourceId`` equal to
    ``"timp2014_probelevel_beta"``) or methylation difference (anything
    else).

    Args:
        start_seq: start of the region, passed through to the data loaders.
        end_seq: end of the region, passed through to the data loaders.
        chromosome: chromosome identifier, passed through to the loaders.
        gene_name: passed to ``Gene_data`` and ``Block_data``.
        measurements: list of measurement dicts with ``id``, ``name``,
            ``datasourceId`` and ``defaultChartType`` keys. ``None`` is
            treated as an empty list.

    Yields:
        In order, one result list per analysis that applies:
        per-gene t-test (always), block overlap, methylation-difference
        correlation, methylation correlation, expression correlation,
        block/expression t-test, expression/methylation correlation and
        expression/methylation-difference correlation.
    """
    print(measurements)
    # The default used to be iterated directly, which raised TypeError.
    if measurements is None:
        measurements = []

    gene_types = []
    block_types = []
    methylation_types = []
    methylation_diff_types = []

    # categorize measurements into different types
    for measurement in measurements:
        data_obj = {
            "id": measurement["id"],
            "name": measurement["name"],
            "datasourceId": measurement["datasourceId"]
        }
        chart_type = measurement["defaultChartType"]
        if chart_type == "scatterplot":
            gene_types.append(data_obj)
        elif chart_type == "block":
            block_types.append(data_obj)
        elif chart_type == "line":
            if measurement["datasourceId"] == "timp2014_probelevel_beta":
                methylation_types.append(data_obj)
            else:
                methylation_diff_types.append(data_obj)

    has_block = len(block_types) > 0
    has_methy = len(methylation_types) > 0
    has_methy_diff = len(methylation_diff_types) > 0
    has_gene = len(gene_types) > 0

    expression_data = Gene_data(start_seq, end_seq, chromosome, gene_name,
                                gene_types)
    block_data = Block_data(start_seq, end_seq, chromosome, gene_name,
                            block_types)

    per_gene_ttest = statistical_methods.ttest_expression_per_gene(
        gene_types, expression_data, chromosome, start_seq, end_seq)
    print("expression_data")
    print(expression_data)
    # The class-based computations below run next to the function-based ones
    # and their results are not yielded; the calls are kept in case compute()
    # has side effects -- TODO confirm and remove if it does not.
    ttest_gene = TtestGene(measurements)
    ttest_gene.compute(chromosome, start_seq, end_seq)
    yield per_gene_ttest

    if has_block:
        # block overlap percentage
        block_overlap = statistical_methods.block_overlap_percent(
            block_types, block_data, start_seq, end_seq)
        block_ol = OverlapBlock(measurements)
        block_ol.compute(chromosome, start_seq, end_seq)
        yield block_overlap

    if has_methy_diff:
        methy_raw_diff = Methylation_diff(start_seq, end_seq, chromosome,
                                          measurements=methylation_diff_types)
        methy_diff_corr_res = statistical_methods.methy_diff_correlation(
            methy_raw_diff, methylation_diff_types)
        print(methy_diff_corr_res)
        methy_diff = CorrelationMethy(measurements, "methy_diff")
        print(methy_diff.compute(chromosome, start_seq, end_seq))
        yield methy_diff_corr_res

    if has_methy:
        methy_raw = Methylation(start_seq, end_seq, chromosome,
                                measurements=methylation_types)
        # methy_raw is loaded for methylation_types, so those are the data
        # sources to pair up (the diff types were passed here before). The
        # helper already skips missing columns and returns a sorted list, so
        # the inline copy of its loop -- which re-appended every pair and
        # raised KeyError on a missing column -- has been removed.
        methy_corr_res = statistical_methods.methy_correlation(
            methy_raw, methylation_types)
        print(methy_corr_res)
        methy_corr = CorrelationMethy(measurements, "methy")
        methy_corr.compute(chromosome, start_seq, end_seq)
        yield methy_corr_res

    if has_gene:
        corr_list = []
        # NOTE(review): this pairs the raw measurement dicts rather than the
        # trimmed gene_types entries; the column check below is what limits
        # it to measurements with expression data. Confirm this is intended.
        for data_source_one, data_source_two in itertools.combinations(
                measurements, 2):
            exp1 = data_source_one['id']
            exp2 = data_source_two['id']
            if (exp1 not in expression_data.columns
                    or exp2 not in expression_data.columns):
                continue

            correlation_coefficient = pearsonr(expression_data[exp1],
                                               expression_data[exp2])
            corr_obj = build_obj('correlation', 'expression', 'expression',
                                 True, data_source_one, data_source_two,
                                 correlation_coefficient[0],
                                 correlation_coefficient[1])
            corr_list.append(corr_obj)

        corr_list = sorted(corr_list, key=lambda x: x['value'], reverse=True)
        print(corr_list)
        corr_exp = CorrelationExp(measurements)
        print(corr_exp.compute(chromosome, start_seq, end_seq))
        yield corr_list

    if has_gene and has_block:
        # gene expression and block independency test
        ttest_block_exp = statistical_methods.ttest_block_expression(
            expression_data, block_data, gene_types, block_types)
        print("adfsjfd")
        print(ttest_block_exp)
        ttest = TtestBlock(measurements)
        print("thersdfjsdf")
        ttest.compute(chromosome, start_seq, end_seq)
        yield ttest_block_exp

    if has_gene and has_methy:
        # correlation between methylation and gene expression
        # with the same tissue type
        test_method = CorrelationExpMethy(measurements, "methy")
        # NOTE(review): argument order here is (start, end, chromosome) while
        # every other compute() call above passes (chromosome, start, end) --
        # confirm against CorrelationExpMethy.compute's signature.
        test_method.compute(start_seq, end_seq, chromosome)
        corr_methy_gene = statistical_methods.expression_methy_correlation(
            expression_data, gene_types, methylation_types, methy_raw)
        print(corr_methy_gene)
        yield corr_methy_gene

    if has_gene and has_methy_diff:
        # correlation between methylation difference and gene expression
        # difference
        test_method = CorrelationExpMethy(measurements, "methy_diff")
        # NOTE(review): same argument-order question as the "methy" case.
        test_method.compute(start_seq, end_seq, chromosome)
        corr_methy_gene = statistical_methods.expression_methydiff_correlation(
            expression_data, gene_types, methylation_diff_types,
            methy_raw_diff)
        print(corr_methy_gene)
        yield corr_methy_gene
def ttest_block_expression(exp_data, block_data, exp_datasource,
                           datasource_types):
    """T-test expression of genes inside blocks against genes outside them.

    For every non-empty block table, the tissue is taken from the second
    ``"_"``-separated field of the block id, and the expression columns
    ``<tissue>___normal`` and ``<tissue>___tumor`` are compared between genes
    that overlap at least one block and genes that lie outside every block.

    Args:
        exp_data: DataFrame with ``start`` and ``end`` columns plus one
            expression column per measurement id.
        block_data: mapping of block id to a DataFrame with ``start`` and
            ``end`` columns.
        exp_datasource: list of expression data-source dicts (with ``id``).
        datasource_types: list of block data-source dicts (with ``id``).

    Returns:
        List of ``build_obj`` results, one per (block id, expression column)
        with at least one gene inside a block, sorted by ``'value'`` in
        descending order.
    """
    ttest_res = []
    gene_expression_block = dict()
    gene_expression_nonblock = dict()

    # loop through block of different tissue types
    for block_type, block_dataframe in block_data.items():
        if block_dataframe.empty:
            continue

        # get tissue type from block id
        tissue_type = block_type.split("_")[1]
        # Expression columns with no data are skipped, as the correlation
        # helpers do, instead of raising KeyError.
        exp_types = [
            column
            for column in (tissue_type + "___normal",
                           tissue_type + "___tumor")
            if column in exp_data.columns
        ]
        if not exp_types:
            continue

        # Classify each gene exactly once. The previous code appended the
        # result of four partial overlap filters per block row (so each
        # overlapping gene was counted about twice) and appended the
        # "outside this row" genes once per row, which inflated both sample
        # sizes. It also relied on DataFrame.append, removed in pandas 2.0.
        in_any_block = pd.Series(False, index=exp_data.index)
        outside_all_blocks = pd.Series(True, index=exp_data.index)
        for start, end in zip(block_dataframe["start"],
                              block_dataframe["end"]):
            in_any_block |= ((exp_data["start"] <= end)
                             & (exp_data["end"] >= start))
            outside_all_blocks &= ((exp_data["end"] < start)
                                   | (exp_data["start"] > end))

        gene_expression_block[block_type] = exp_data.loc[in_any_block,
                                                         exp_types]
        gene_expression_nonblock[block_type] = exp_data.loc[
            outside_all_blocks, exp_types]

    print(gene_expression_block.items())
    print(gene_expression_nonblock.items())

    pd_block = pd.DataFrame(datasource_types)
    pd_expression = pd.DataFrame(exp_datasource)

    # calculate t test between block and non-block gene expression of the
    # same tissue type
    for block_type, gene_per_block_exp in gene_expression_block.items():
        gene_per_nonblock_exp = gene_expression_nonblock[block_type]
        print(gene_per_block_exp.columns)
        for exp_type in gene_per_block_exp:
            gene_block_exp = gene_per_block_exp[exp_type]
            if gene_block_exp.empty:
                continue
            gene_nonblock_exp = gene_per_nonblock_exp[exp_type]

            t_value, p_value = ttest_ind(gene_block_exp, gene_nonblock_exp,
                                         equal_var=False)
            print("block:" + block_type + ", gene:" + exp_type)
            print(p_value)

            # to_json(orient='records') yields "[{...}]"; stripping the
            # brackets leaves the single matching record as a JSON object.
            gene_ds = json.loads(
                pd_expression.loc[pd_expression['id'] == exp_type].to_json(
                    orient='records')[1:-1])
            block_ds = json.loads(
                pd_block.loc[pd_block['id'] == block_type].to_json(
                    orient='records')[1:-1])

            data = format_expression_block_data(gene_block_exp,
                                                gene_nonblock_exp)
            ttest_obj = build_obj('t-test', 'expression', 'block', False,
                                  gene_ds, block_ds, t_value, p_value, data)
            ttest_res.append(ttest_obj)

    ttest_res = sorted(ttest_res, key=lambda x: x['value'], reverse=True)
    return ttest_res
def _clip_blocks(blocks, window_start, window_end):
    """Return the sorted (start, end) parts of ``blocks`` inside the window."""
    clipped = []
    # Iterating the columns (instead of indexing them by position) works for
    # frames whose index is not 0..n-1, e.g. after filtering.
    for start, end in zip(blocks['start'], blocks['end']):
        lower = max(start, window_start)
        upper = min(end, window_end)
        if upper > lower:
            clipped.append((lower, upper))
    clipped.sort()
    return clipped


def _overlap_length(blocks_one, blocks_two):
    """Sum the lengths shared by two sorted lists of (start, end) intervals."""
    total = 0
    index_one = 0
    index_two = 0
    while index_one < len(blocks_one) and index_two < len(blocks_two):
        start_one, end_one = blocks_one[index_one]
        start_two, end_two = blocks_two[index_two]
        common_start = max(start_one, start_two)
        common_end = min(end_one, end_two)
        if common_end > common_start:
            total += common_end - common_start
        # Advance whichever interval finishes first; the other may still
        # overlap the next interval of the opposite list.
        if end_two < end_one:
            index_two += 1
        else:
            index_one += 1
    return total


def block_overlap_percent(data_sources, block_data, start_seq, end_seq):
    """Measure how much the blocks of each pair of data sources overlap.

    For every pair of data sources that both have an entry in ``block_data``
    the blocks are clipped to ``[start_seq, end_seq]``, the overlap and union
    lengths are computed, and a Fisher exact test is run on the 2x2 table
    ``[[overlap, one only], [two only, neither]]``. Pairs whose odds ratio is
    NaN are skipped.

    Args:
        data_sources: list of data-source dicts with an ``'id'`` key.
        block_data: mapping of data-source id to a table with ``'start'`` and
            ``'end'`` columns.
        start_seq: start of the region (anything ``float``/``int`` accept).
        end_seq: end of the region (anything ``float``/``int`` accept).

    Returns:
        List of ``build_obj`` results carrying overlap / union as the value
        and the Fisher p-value, sorted by ``'value'`` in descending order.
        Empty when ``block_data`` is empty.
    """
    block_overlap = []
    if not block_data:
        return block_overlap

    window_start = float(start_seq)
    window_end = float(end_seq)
    window_length = int(end_seq) - int(start_seq)

    for data_source_one, data_source_two in itertools.combinations(
            data_sources, 2):
        tissue_type_one = data_source_one["id"]
        tissue_type_two = data_source_two["id"]
        if (tissue_type_one not in block_data
                or tissue_type_two not in block_data):
            continue

        blocks_one = _clip_blocks(block_data[tissue_type_one], window_start,
                                  window_end)
        blocks_two = _clip_blocks(block_data[tissue_type_two], window_start,
                                  window_end)

        # union is the sum of both covered lengths minus the shared length
        overlap = _overlap_length(blocks_one, blocks_two)
        covered_one = sum(end - start for start, end in blocks_one)
        covered_two = sum(end - start for start, end in blocks_two)
        union = covered_one + covered_two - overlap

        block_one_only = max(covered_one - overlap, 0)
        block_two_only = max(covered_two - overlap, 0)
        non_block = max(window_length - union, 0)
        fisher_table = np.array([[overlap, block_one_only],
                                 [block_two_only, non_block]])

        odds_ratio, p_value = fisher_exact(fisher_table)
        if math.isnan(odds_ratio):
            continue
        print('p value is ' + str(p_value))
        print('odds ratio is ' + str(odds_ratio))

        overlap_percent = 0.0 if union == 0.0 else overlap * 1.0 / union
        overlap_obj = build_obj('overlap', 'block', 'block', False,
                                data_source_one, data_source_two,
                                overlap_percent, p_value)
        block_overlap.append(overlap_obj)

    block_overlap = sorted(block_overlap, key=lambda x: x['value'],
                           reverse=True)
    print("hsdsdfsdf")
    print(block_overlap)
    print('overlap done!')
    return block_overlap