Exemple #1
0
def expression_correlation(gene_types, expression_data):
    """Compute pairwise Pearson correlations between expression columns.

    Args:
        gene_types: iterable of data-source dicts. Each dict must have an
            ``"id"`` key; the id is used as a column name in
            ``expression_data``.
        expression_data: table exposing ``.columns`` and column lookup by
            id (presumably a pandas DataFrame -- confirm against caller).

    Returns:
        A list with one ``build_obj(...)`` result per pair of data sources
        whose columns are both present in ``expression_data``, sorted by
        each result's ``'value'`` entry in descending order. The list is
        empty when no pair qualifies.
    """
    corr_list = []
    for data_source_one, data_source_two in itertools.combinations(
            gene_types, 2):
        exp1 = data_source_one['id']
        exp2 = data_source_two['id']

        # Skip pairs for which no expression column was loaded.
        if (exp1 not in expression_data.columns
                or exp2 not in expression_data.columns):
            continue

        # pearsonr returns (coefficient, p-value); both are forwarded to
        # build_obj in that order.
        correlation_coefficient = pearsonr(expression_data[exp1],
                                           expression_data[exp2])
        corr_obj = build_obj('correlation', 'expression', 'expression', True,
                             data_source_one, data_source_two,
                             correlation_coefficient[0],
                             correlation_coefficient[1])
        corr_list.append(corr_obj)

    return sorted(corr_list, key=lambda x: x['value'], reverse=True)
Exemple #2
0
def methy_correlation(methy_raw, methylation_diff_types):
    """Correlate methylation columns that share the same id prefix.

    Every pair of entries in ``methylation_diff_types`` is considered. A
    pair is used only when the text before the first ``"_"`` of both ids
    is equal and both ids are columns of ``methy_raw``.

    Args:
        methy_raw: table with ``.empty``, ``.columns`` and column lookup
            by id (presumably a pandas DataFrame -- confirm).
        methylation_diff_types: iterable of data-source dicts with a
            string ``"id"`` key.

    Returns:
        List of ``build_obj(...)`` results sorted by ``'value'``
        descending; an empty list when ``methy_raw`` is empty.
    """
    if methy_raw.empty:
        return []

    results = []
    pairs = itertools.combinations(methylation_diff_types, 2)
    for source_a, source_b in pairs:
        id_a = source_a["id"]
        id_b = source_b["id"]

        # Only compare columns whose id prefix (text before "_") matches.
        same_prefix = id_a.split("_")[0] == id_b.split("_")[0]
        if not (same_prefix and id_a in methy_raw.columns
                and id_b in methy_raw.columns):
            continue

        values_a = methy_raw[id_a]
        values_b = methy_raw[id_b]
        pearson = pearsonr(values_a, values_b)

        print(id_a, id_b)
        # Observed [min, max] of each column, forwarded as `ranges`.
        ranges = {
            'attr-one': [min(values_a), max(values_a)],
            'attr-two': [min(values_b), max(values_b)],
        }
        results.append(
            build_obj('correlation', 'methylation', 'methylation', True,
                      source_a, source_b, pearson[0], pearson[1],
                      ranges=ranges))

    results.sort(key=lambda item: item['value'], reverse=True)
    return results
Exemple #3
0
def methy_diff_correlation(methy_diff_data, methylation_diff_types):
    """Correlate every pair of methylation-difference columns.

    Args:
        methy_diff_data: table with ``.empty``, ``.columns`` and column
            lookup by id (presumably a pandas DataFrame -- confirm).
        methylation_diff_types: iterable of data-source dicts with an
            ``"id"`` key naming a column of ``methy_diff_data``.

    Returns:
        List of ``build_obj(...)`` results, one per pair whose columns
        are both present, sorted by ``'value'`` descending; an empty list
        when ``methy_diff_data`` is empty.
    """
    if methy_diff_data.empty:
        return []

    results = []
    pairs = itertools.combinations(methylation_diff_types, 2)
    for source_a, source_b in pairs:
        id_a = source_a["id"]
        id_b = source_b["id"]

        # Both columns must have been loaded for this pair to be usable.
        available = methy_diff_data.columns
        if not (id_a in available and id_b in available):
            continue

        values_a = methy_diff_data[id_a]
        values_b = methy_diff_data[id_b]
        pearson = pearsonr(values_a, values_b)

        # Observed [min, max] of each column, forwarded as `ranges`.
        ranges = {
            'attr-one': [min(values_a), max(values_a)],
            'attr-two': [min(values_b), max(values_b)],
        }
        results.append(
            build_obj('correlation', 'methylation diff', 'methylation diff',
                      True, source_a, source_b, pearson[0], pearson[1],
                      ranges=ranges))

    results.sort(key=lambda item: item['value'], reverse=True)
    return results
Exemple #4
0
def comp_req(start_seq, end_seq, chromosome, gene_name, measurements=None):
    """Generator that runs the available statistical tests for a region.

    The measurements are split into expression, block, methylation and
    methylation-difference groups by their ``defaultChartType`` /
    ``datasourceId`` fields, and one result list is yielded per analysis
    that has data:

    1. per-gene expression t-test (always yielded first),
    2. block overlap (when there are block measurements),
    3. methylation-difference correlation,
    4. methylation correlation,
    5. expression correlation,
    6. expression-vs-block t-test,
    7. expression-vs-methylation correlation,
    8. expression-vs-methylation-difference correlation.

    Alongside each ``statistical_methods`` call, a matching class-based
    implementation (``TtestGene``, ``OverlapBlock``, ``CorrelationMethy``,
    ``CorrelationExp``, ``TtestBlock``, ``CorrelationExpMethy``) is
    constructed and its ``compute`` method is called; those results are
    only printed or discarded, never yielded.

    Args:
        start_seq: start of the region, forwarded to the data loaders.
        end_seq: end of the region, forwarded to the data loaders.
        chromosome: chromosome identifier, forwarded to the data loaders.
        gene_name: forwarded to ``Gene_data`` and ``Block_data``.
        measurements: iterable of dicts with the keys ``"id"``,
            ``"name"``, ``"datasourceId"`` and ``"defaultChartType"``.
            NOTE(review): the default ``None`` is not handled -- the
            ``for`` loop below raises ``TypeError`` when the generator is
            first advanced.

    Yields:
        The result of each analysis listed above, in that order.
    """

    print(measurements)

    # extract data from measurements
    gene_types = []
    block_types = []

    methylation_types = []
    methylation_diff_types = []
    # categorize measurements into different types
    for measurement in measurements:
        # Keep only the fields the downstream analyses use.
        data_obj = {
            "id": measurement["id"],
            "name": measurement["name"],
            "datasourceId": measurement["datasourceId"]
        }
        if measurement["defaultChartType"] == "scatterplot":
            gene_types.append(data_obj)
        elif measurement["defaultChartType"] == "block":
            block_types.append(data_obj)
        elif measurement["defaultChartType"] == "line":
            # "line" measurements from this one data source are treated as
            # raw methylation; every other "line" measurement is treated
            # as a methylation difference.
            if measurement["datasourceId"] == "timp2014_probelevel_beta":
                methylation_types.append(data_obj)
            else:
                methylation_diff_types.append(data_obj)

    has_block = len(block_types) > 0
    has_methy = len(methylation_types) > 0
    has_methy_diff = len(methylation_diff_types) > 0
    has_gene = len(gene_types) > 0
    # Expression and block data are loaded unconditionally, even when the
    # corresponding measurement list is empty.
    expression_data = Gene_data(start_seq, end_seq, chromosome, gene_name,
                                gene_types)
    block_data = Block_data(start_seq, end_seq, chromosome, gene_name,
                            block_types)
    per_gene_ttest = statistical_methods.ttest_expression_per_gene(
        gene_types, expression_data, chromosome, start_seq, end_seq)
    print("expression_data")
    print(expression_data)
    # Class-based counterpart of the per-gene t-test; its result is not used.
    ttest_gene = TtestGene(measurements)
    ttest_gene.compute(chromosome, start_seq, end_seq)

    yield per_gene_ttest
    if has_block:
        # block overlap percentage
        block_overlap = statistical_methods.block_overlap_percent(
            block_types, block_data, start_seq, end_seq)
        # Class-based counterpart; its result is not used.
        block_ol = OverlapBlock(measurements)
        block_ol.compute(chromosome, start_seq, end_seq)

        yield block_overlap
    if has_methy_diff:
        # methy_raw_diff is reused by the expression/methylation-diff
        # branch at the end of this function.
        methy_raw_diff = Methylation_diff(start_seq,
                                          end_seq,
                                          chromosome,
                                          measurements=methylation_diff_types)
        methy_diff_corr_res = statistical_methods.methy_diff_correlation(
            methy_raw_diff, methylation_diff_types)
        print(methy_diff_corr_res)
        methy_diff = CorrelationMethy(measurements, "methy_diff")
        print(methy_diff.compute(chromosome, start_seq, end_seq))
        yield methy_diff_corr_res
    if has_methy:

        # methy_raw is reused by the expression/methylation branch below.
        methy_raw = Methylation(start_seq,
                                end_seq,
                                chromosome,
                                measurements=methylation_types)
        # NOTE(review): methy_raw is loaded for methylation_types, but the
        # call below and the loop that follows both use
        # methylation_diff_types -- looks like methylation_types may have
        # been intended; confirm.
        methy_corr_res = statistical_methods.methy_correlation(
            methy_raw, methylation_diff_types)

        #  print(methy_corr_res)
        # loop through normal/tumor of each tissue type
        # NOTE(review): this loop appends to the list already produced by
        # statistical_methods.methy_correlation above, so a pair handled
        # by both would appear twice; unlike that helper, there is no
        # check here that the ids are columns of methy_raw -- confirm.
        for data_source_one, data_source_two in itertools.combinations(
                methylation_diff_types, 2):
            type1 = data_source_one["id"]
            type2 = data_source_two["id"]
            # Only compare ids that share the text before the first "_".
            if type1.split("_")[0] != type2.split("_")[0]:
                continue

            correlation_coefficient = pearsonr(methy_raw[type1],
                                               methy_raw[type2])

            print(type1, type2)
            data_range = {
                'attr-one': [min(methy_raw[type1]),
                             max(methy_raw[type1])],
                'attr-two': [min(methy_raw[type2]),
                             max(methy_raw[type2])]
            }
            corr_obj = build_obj('correlation',
                                 'methylation',
                                 'methylation',
                                 True,
                                 data_source_one,
                                 data_source_two,
                                 correlation_coefficient[0],
                                 correlation_coefficient[1],
                                 ranges=data_range)
            methy_corr_res.append(corr_obj)
        methy_corr_res = sorted(methy_corr_res,
                                key=lambda x: x['value'],
                                reverse=True)
        print(methy_corr_res)
        # Class-based counterpart; its result is not used.
        methy_corr = CorrelationMethy(measurements, "methy")
        methy_corr.compute(chromosome, start_seq, end_seq)

        yield methy_corr_res
    if has_gene:
        # expression_data = get_gene_data(start_seq, end_seq, chromosome,
        #                                 gene_types)
        corr_list = []
        # pvalue_list = []
        # NOTE(review): pairs are drawn from all `measurements`, not from
        # gene_types; pairs without a column in expression_data are
        # skipped by the check below.
        for data_source_one, data_source_two in itertools.combinations(
                measurements, 2):
            exp1 = data_source_one['id']
            exp2 = data_source_two['id']

            if exp1 not in expression_data.columns or exp2 not in expression_data.columns:
                continue

            col_one = expression_data[exp1]
            col_two = expression_data[exp2]

            correlation_coefficient = pearsonr(col_one, col_two)
            corr_obj = build_obj('correlation', 'expression', 'expression',
                                 True, data_source_one, data_source_two,
                                 correlation_coefficient[0],
                                 correlation_coefficient[1])
            corr_list.append(corr_obj)

            # NOTE(review): t_value and p_value are computed but never used.
            t_value, p_value = ttest_ind(col_one, col_two, equal_var=False)
        corr_list = sorted(corr_list, key=lambda x: x['value'], reverse=True)

        print(corr_list)
        corr_exp = CorrelationExp(measurements)
        print(corr_exp.compute(chromosome, start_seq, end_seq))
        yield corr_list
    if has_gene and has_block:

        # gene expression and block independency test
        ttest_block_exp = statistical_methods.ttest_block_expression(
            expression_data, block_data, gene_types, block_types)
        print("adfsjfd")
        print(ttest_block_exp)
        # Class-based counterpart; its result is not used.
        ttest = TtestBlock(measurements)
        print("thersdfjsdf")
        ttest.compute(chromosome, start_seq, end_seq)
        yield ttest_block_exp
    if has_gene and has_methy:

        # correlation between methylation and gene expression
        # with the same tissue type
        # NOTE(review): compute() is called here with
        # (start_seq, end_seq, chromosome), while the other classes above
        # are called with (chromosome, start_seq, end_seq) -- confirm the
        # expected argument order.
        test_method = CorrelationExpMethy(measurements, "methy")
        test_method.compute(start_seq, end_seq, chromosome)
        corr_methy_gene = statistical_methods.expression_methy_correlation(
            expression_data, gene_types, methylation_types, methy_raw)
        print(corr_methy_gene)
        yield corr_methy_gene
    if has_gene and has_methy_diff:

        # correlation between methylation difference and gene expression
        # difference
        # NOTE(review): same (start_seq, end_seq, chromosome) argument
        # order as the branch above -- confirm.
        test_method = CorrelationExpMethy(measurements, "methy_diff")
        test_method.compute(start_seq, end_seq, chromosome)
        corr_methy_gene = statistical_methods.expression_methydiff_correlation(
            expression_data, gene_types, methylation_diff_types,
            methy_raw_diff)
        print(corr_methy_gene)

        yield corr_methy_gene
Exemple #5
0
def _concat_expression_frames(frames, columns):
    """Stack the non-empty frames row-wise; empty frame with *columns* if none."""
    non_empty = [frame for frame in frames if not frame.empty]
    if not non_empty:
        return pd.DataFrame(columns=columns)
    return pd.concat(non_empty)


def ttest_block_expression(exp_data, block_data, exp_datasource,
                           datasource_types):
    """T-test expression inside blocks against expression outside blocks.

    For every non-empty block table, the tissue name is taken from the
    second ``"_"``-separated field of the block id, and the expression
    columns ``<tissue>___normal`` and ``<tissue>___tumor`` are split into
    rows that overlap a block and rows that do not. A Welch t-test
    (``equal_var=False``) is then run per expression column.

    Args:
        exp_data: expression table with ``"start"`` and ``"end"`` columns
            plus one column per expression type (presumably a pandas
            DataFrame with one row per gene -- confirm).
        block_data: mapping of block id to a table with ``"start"`` and
            ``"end"`` columns.
        exp_datasource: records describing the expression data sources;
            each needs an ``"id"`` matching an expression column name.
        datasource_types: records describing the block data sources;
            each needs an ``"id"`` matching a key of ``block_data``.

    Returns:
        List of ``build_obj(...)`` results sorted by ``'value'``
        descending. Expression columns with no rows inside any block are
        skipped.
    """
    ttest_res = []
    gene_expression_block = dict()
    gene_expression_nonblock = dict()

    # loop through block of different tissue types
    for block_type, block_dataframe in block_data.items():
        if block_dataframe.empty:
            continue

        # get tissue type from block id; only expression columns of the
        # same tissue are compared with this block
        tissue_type = block_type.split("_")[1]
        exp_types = [tissue_type + "___normal", tissue_type + "___tumor"]

        # Frames are collected and concatenated once per block type:
        # DataFrame.append was removed in pandas 2.0, and appending inside
        # the loop re-copies the accumulated rows on every iteration.
        block_frames = []
        nonblock_frames = []
        for _, row in block_dataframe.iterrows():
            start = row["start"]
            end = row["end"]

            # NOTE(review): a row of exp_data that satisfies more than one
            # of these masks is added once per matching mask, so it can
            # appear several times in the in-block sample -- confirm this
            # is intended.
            overlap_masks = (
                (start <= exp_data["start"]) & (exp_data["start"] <= end),
                (start <= exp_data["end"]) & (exp_data["end"] <= end),
                (exp_data["start"] <= start) & (start <= exp_data["end"]),
                (exp_data["start"] <= end) & (end <= exp_data["end"]),
            )
            block_frames.extend(
                exp_data[mask][exp_types] for mask in overlap_masks)

            # NOTE(review): "outside" is evaluated per block row, so a row
            # of exp_data outside several block rows is added once for
            # each of them -- confirm this is intended.
            outside = (exp_data["end"] < start) | (exp_data["start"] > end)
            nonblock_frames.append(exp_data[outside][exp_types])

        gene_expression_block[block_type] = _concat_expression_frames(
            block_frames, exp_types)
        gene_expression_nonblock[block_type] = _concat_expression_frames(
            nonblock_frames, exp_types)

    print(gene_expression_block.items())
    print(gene_expression_nonblock.items())

    pd_block = pd.DataFrame(datasource_types)
    pd_expression = pd.DataFrame(exp_datasource)

    # calculate t test between block and non-block gene expression of the
    # same tissue type
    for block_type, gene_per_block_exp in gene_expression_block.items():
        gene_per_nonblock_exp = gene_expression_nonblock[block_type]
        print(gene_per_block_exp.columns)
        for exp_type in gene_per_block_exp:

            gene_block_exp = gene_per_block_exp[exp_type]

            if gene_block_exp.empty:
                continue

            gene_nonblock_exp = gene_per_nonblock_exp[exp_type]

            t_value, p_value = ttest_ind(gene_block_exp,
                                         gene_nonblock_exp,
                                         equal_var=False)
            print("block:" + block_type + ", gene:" + exp_type)
            print(p_value)
            # to_json(orient='records') yields "[{...}]"; stripping the
            # brackets leaves the single matching record as a JSON object.
            gene_ds = json.loads(
                pd_expression.loc[pd_expression['id'] == exp_type].to_json(
                    orient='records')[1:-1])
            block_ds = json.loads(
                pd_block.loc[pd_block['id'] == block_type].to_json(
                    orient='records')[1:-1])

            data = format_expression_block_data(gene_block_exp,
                                                gene_nonblock_exp)

            ttest_obj = build_obj('t-test', 'expression', 'block', False,
                                  gene_ds, block_ds, t_value, p_value, data)

            ttest_res.append(ttest_obj)

    ttest_res = sorted(ttest_res, key=lambda x: x['value'], reverse=True)

    return ttest_res
Exemple #6
0
def block_overlap_percent(data_sources, block_data, start_seq, end_seq):
    """Measure how much the blocks of each pair of data sources overlap.

    For every pair of data sources that both have an entry in
    ``block_data``, the blocks are clipped to the window
    ``[start_seq, end_seq]``. The overlapping length, the union length
    and a Fisher exact test on the 2x2 table
    ``[[overlap, one-only], [two-only, neither]]`` are then computed.

    Args:
        data_sources: iterable of data-source dicts with an ``"id"`` key.
        block_data: mapping of data-source id to a table whose
            ``"start"`` and ``"end"`` entries are equally long sequences
            of block boundaries.
        start_seq: window start; must be accepted by ``float()`` and
            ``int()``.
        end_seq: window end; must be accepted by ``float()`` and
            ``int()``.

    Returns:
        List of ``build_obj(...)`` results (overlap / union as the value,
        Fisher p-value alongside) sorted by ``'value'`` descending. Pairs
        whose Fisher odds ratio is NaN are omitted. An empty list is
        returned when ``block_data`` is empty.
    """
    block_overlap = []
    if not block_data:
        return block_overlap

    # The window bounds are loop-invariant; convert them once.
    window_start = float(start_seq)
    window_end = float(end_seq)

    for data_source_one, data_source_two in itertools.combinations(
            data_sources, 2):
        tissue_type_one = data_source_one["id"]
        tissue_type_two = data_source_two["id"]

        if tissue_type_one not in block_data or tissue_type_two not in block_data:
            continue

        block_tissue_one = block_data[tissue_type_one]
        block_tissue_two = block_data[tissue_type_two]

        # Materialise the boundaries as plain lists so the sweep below
        # indexes them by position. Indexing a pandas column with an
        # integer is label-based and fails (or picks the wrong row) when
        # the table's index is not 0..n-1.
        starts_one = list(block_tissue_one['start'])
        ends_one = list(block_tissue_one['end'])
        starts_two = list(block_tissue_two['start'])
        ends_two = list(block_tissue_two['end'])

        # calculate regions for each of the block tissues separately
        # union regions should be the sum of these regions minus overlap region
        block_one_region = [
            min(end, window_end) - max(start, window_start)
            for start, end in zip(starts_one, ends_one)
            if min(end, window_end) > max(start, window_start)
        ]
        block_two_region = [
            min(end, window_end) - max(start, window_start)
            for start, end in zip(starts_two, ends_two)
            if min(end, window_end) > max(start, window_start)
        ]

        # Two-pointer sweep over both block lists.
        # NOTE(review): this assumes each list is ordered by position and
        # its blocks do not overlap each other -- confirm against the
        # data loader.
        overlap_region = []
        block_one_ind = 0
        block_two_ind = 0
        block_one_len = len(starts_one)
        block_two_len = len(starts_two)
        while block_one_ind < block_one_len and block_two_ind < block_two_len:
            tissue_one_start = max(window_start, starts_one[block_one_ind])
            tissue_two_start = max(window_start, starts_two[block_two_ind])
            tissue_one_end = min(window_end, ends_one[block_one_ind])
            tissue_two_end = min(window_end, ends_two[block_two_ind])

            # there is an overlap
            if tissue_one_start <= tissue_two_start < tissue_one_end or \
               tissue_one_start < tissue_two_end <= tissue_one_end or \
               tissue_two_start <= tissue_one_start < tissue_two_end or \
               tissue_two_start < tissue_one_end <= tissue_two_end:
                common_end = min(tissue_two_end, tissue_one_end)
                common_start = max(tissue_one_start, tissue_two_start)
                if common_end > common_start:
                    overlap_region.append(common_end - common_start)
                # Advance whichever block ends first.
                if tissue_two_end < tissue_one_end:
                    block_two_ind += 1
                else:
                    block_one_ind += 1
            # block one lies entirely before block two
            elif tissue_two_start >= tissue_one_end:
                block_one_ind += 1
            # block two lies entirely before block one
            elif tissue_one_start >= tissue_two_end:
                block_two_ind += 1

        overlap = sum(overlap_region)
        total_one = sum(block_one_region)
        total_two = sum(block_two_region)
        union = total_one + total_two - overlap
        block_one_only = max(total_one - overlap, 0)
        block_two_only = max(total_two - overlap, 0)
        non_block = max(int(end_seq) - int(start_seq) - union, 0)
        fisher_table = np.array([[overlap, block_one_only],
                                 [block_two_only, non_block]])

        odds_ratio, p_value = fisher_exact(fisher_table)
        # A NaN odds ratio means the test is undefined for this table;
        # such pairs are left out of the result.
        if math.isnan(odds_ratio):
            continue
        print('p value is ' + str(p_value))
        print('odds ratio is ' + str(odds_ratio))
        overlap_percent = 0.0 if union == 0.0 else overlap * 1.0 / union
        overlap_obj = build_obj('overlap', 'block', 'block', False,
                                data_source_one, data_source_two,
                                overlap_percent, p_value)
        block_overlap.append(overlap_obj)

    block_overlap = sorted(block_overlap,
                           key=lambda x: x['value'],
                           reverse=True)
    print("hsdsdfsdf")
    print(block_overlap)
    print('overlap done!')
    return block_overlap