def adeSavGolFeatureCorrelation(data, max_corr, country):
    """
    Measure, for every underlying datatype, how strongly its first two
    columns are correlated, and drop the "<prefix>L" column when they are
    too correlated.

    For each prefix the columns of ``data`` whose names start with that
    prefix are selected; the first two of them are compared with Pearson's r
    and with the first value returned by ``minerva.mine`` (reported under
    the 'MIC' label). The table of results is printed and written to an
    Excel file named after ``country``.

    :param data: (pandas DataFrame) table holding, for each prefix, at least
        two columns whose names start with that prefix
    :param max_corr: (float) a prefix is flagged when either correlation
        value is strictly greater than this threshold
    :param country: (string) used as the Excel file name (without extension)
    :return: (pandas DataFrame) ``data`` without the flagged "<prefix>L"
        columns
    """
    print(
        "Testing Correlation between corresponding L and C AVG statistics for each underlying datatype"
    )
    prefixes = ['I2', 'I1', 'GM2', 'GM1', 'FF1', 'MP1', 'MP4', 'MP2']
    row_labels = ['Pearson', 'MIC']
    correlation_results = pd.DataFrame(columns=prefixes, index=row_labels)

    flagged = []
    for prefix in prefixes:
        matching = [
            name for name in data.columns.values if name.startswith(prefix)
        ]
        pair = data[matching]
        first = pair.iloc[:, 0]
        second = pair.iloc[:, 1]

        pearson_r = sp.stats.pearsonr(first, second)[0]
        mine_output = minerva.mine(
            vc.FloatVector(np.asarray(first)),
            vc.FloatVector(np.asarray(second)))
        mic = mine_output[0][0]

        correlation_results[prefix] = pd.Series(
            [pearson_r, float(mic)], index=row_labels)

        # Only the "<prefix>L" column is dropped; the other column of the
        # pair is kept.
        if pearson_r > max_corr or mic > max_corr:
            flagged.append("{}{}".format(prefix, "L"))

    kept = [name for name in list(data.columns.values) if name not in flagged]
    data = data.loc[:, kept]

    correlation_results.to_excel(
        "../Reserach/AdeSavGol Transform/L&C correlation/{}.xlsx".format(
            country),
        engine="openpyxl")
    print(correlation_results)
    return data
def mann_withney_test_r(list_values1, list_values2):
    """
    Run R's two-sided ``wilcox.test`` (without continuity correction) on two
    samples.

    :param list_values1: (list of float) first sample
    :param list_values2: (list of float) second sample
    :return: (float) the p-value reported by R
    """
    r_wilcox = robj.r("""
    function(x, y){
        test = wilcox.test(x,y, alternative='two.sided', correct=F)
        return(test$p.value)
    }
    """)
    sample_x = v.FloatVector(list_values1)
    sample_y = v.FloatVector(list_values2)
    r_result = r_wilcox(sample_x, sample_y)
    return float(r_result[0])
def pvalue_getter(list1, list2):
    """
    Print both samples, then compare them with R's ``wilcox.test`` using its
    default settings.

    :param list1: (list of float) first sample
    :param list2: (list of float) second sample
    :return: (float) the p-value reported by R
    """
    for sample in (list1, list2):
        print(sample)
    r_wilcox = robj.r("""
    function(list1, list2){
        return(wilcox.test(list1,list2)$p.value)
    }
    """)
    r_result = r_wilcox(v.FloatVector(list1), v.FloatVector(list2))
    return float(r_result[0])
def frequency_test(obs1, tot1, obs2, tot2):
    """
    Compare two proportions with R's ``chisq.test`` on a 2x2 table.

    The table is filled row by row as ``[obs1, tot1 - obs1]`` and
    ``[obs2, tot2 - obs2]``. The four cell counts and the p-value are
    printed before returning.

    :param obs1: (int) the count number of an amino acid X in the set of
        protein 1.
    :param tot1: (int) the total number of amino acids in the set of
        protein 1.
    :param obs2: (int) the count number of an amino acid X in the set of
        protein 2.
    :param tot2: (int) the total number of amino acids in the set of
        protein 2.
    :return: (float or string) the p-value, or the string "NA" when R
        returns NaN
    """
    r_chisq = robj.r("""
    function(vect){
        m<-matrix(vect, byrow=T, nrow=2)
        return(chisq.test(m)$p.value)
    }
    """)
    rest1 = tot1 - obs1
    rest2 = tot2 - obs2
    cells = v.FloatVector([obs1, rest1, obs2, rest2])
    pval = float(r_chisq(cells)[0])
    print(obs1, rest1, obs2, rest2)
    print(pval)
    return "NA" if np.isnan(pval) else pval
def create_statistical_report(list_values, list_name, ctrl_full, filename, nt):
    """
    Compare each list of values to a control list and write the p-values to
    a tab-separated file.

    NaN entries are removed from every list before testing. The raw p-values
    are corrected with R's ``p.adjust`` (method "BH") and stored next to the
    raw ones. The output path is ``filename`` with ".html" replaced by
    "wilcox_stat.txt".

    :param list_values: (list of list of floats) the lists of values to
        compare to the control list
    :param list_name: (list of string) the name of each sublist of
        ``list_values``
    :param ctrl_full: (list of float) the control values; when ``nt`` is
        truthy the control values are read from ``ctrl_full[nt]`` instead
    :param filename: (string) the name of the figure associated with those
        stats
    :param nt: (string) the nucleotide studied, or a falsy value to use
        ``ctrl_full`` directly
    """
    ctrl_source = ctrl_full[nt] if nt else ctrl_full
    ctrl_array = np.array(ctrl_source, dtype=float)
    cur_ctrl = list(ctrl_array[~np.isnan(ctrl_array)])

    factors = []
    pvalues = []
    for idx, values in enumerate(list_values):
        factors.append(list_name[idx])
        value_array = np.array(values, dtype=float)
        sample = list(value_array[~np.isnan(value_array)])
        pvalues.append(
            statistical_analysis.mann_withney_test_r(sample, cur_ctrl))

    df = pd.DataFrame({"Factor": factors, "P-value": pvalues})
    rstats = robj.packages.importr('stats')
    df["P-adjusted_BH"] = rstats.p_adjust(v.FloatVector(pvalues), method="BH")
    report_path = filename.replace(".html", "wilcox_stat.txt")
    df.to_csv(report_path, sep="\t", index=False)
def adjust_pvalues(pvalues):
    """
    Correct a list of p-values with R's ``p.adjust`` using the "BH" method.

    :param pvalues: (list of float) raw p-values
    :return: (list of float) corrected p-values, in the same order
    """
    r_pvalues = v.FloatVector(pvalues)
    stats_pkg = robj.packages.importr('stats')
    adjusted = stats_pkg.p_adjust(r_pvalues, method="BH")
    return list(np.array(adjusted))
def mann_withney_test_r(list_values1, list_values2):
    """
    Compare two samples with a two-sided Mann-Whitney-Wilcoxon test run
    in R (``wilcox.test`` with ``correct=F``).

    :param list_values1: (list of float) first sample
    :param list_values2: (list of float) second sample
    :return: (float) the p-value of the test between ``list_values1`` and
        ``list_values2``
    """
    r_test = robj.r("""
    function(x, y){
        test = wilcox.test(x,y, alternative='two.sided', correct=F)
        return(test$p.value)
    }
    """)
    r_args = [v.FloatVector(values) for values in (list_values1, list_values2)]
    return float(r_test(*r_args)[0])
def mann_withney_test_r(list_values1, list_values2, alt="less"):
    """
    Perform a Mann-Whitney-Wilcoxon test on ``list_values1`` and
    ``list_values2`` with R's ``wilcox.test`` (``correct=F``).

    The alternative hypothesis is handed to R as a function argument rather
    than being formatted into the R source text, so a value containing
    quotes or other R syntax cannot break or alter the code that is
    evaluated.

    :param list_values1: (list of float) first sample
    :param list_values2: (list of float) second sample
    :param alt: (string) the alternative hypothesis selected, forwarded to
        the ``alternative`` argument of ``wilcox.test``
    :return: (float) the p-value of the test between ``list_values1`` and
        ``list_values2``
    """
    r_wilcox = robj.r("""
    function(x, y, alt){
        test = wilcox.test(x, y, alternative=alt, correct=F)
        return(test$p.value)
    }
    """)
    r_result = r_wilcox(
        v.FloatVector(list_values1),
        v.FloatVector(list_values2),
        v.StrVector([alt]))
    return float(r_result[0])
def r_ttest(x, y):
    """
    Compare two samples with R's ``t.test`` using its default settings.

    :param x: (list of float value) first sample
    :param y: (list of float value) second sample
    :return: (float) the p-value of the test, or NaN when the test or the
        conversion of its result raises
    """
    import rpy2.robjects as robj
    import rpy2.robjects.vectors as v

    ttestmaker = robj.r("""
    function(x,y){
        test = t.test(x,y)
        return(test$p.value)
    }""")
    try:
        pval = float(ttestmaker(v.FloatVector(x), v.FloatVector(y))[0])
    except Exception:
        # Best effort: a failing test yields NaN. ``Exception`` (not a bare
        # ``except``) so KeyboardInterrupt and SystemExit still propagate.
        pval = float("nan")
    return pval
def _to_mccm_vec(x):
    """
    Compose vector into format for multispatialCCM.R

    Every element of ``x`` is expanded to a list, prefixed with a NaN, and
    the resulting rows are flattened into one vector.

    Args:
        x (np.array): Data; each element must itself be iterable

    Returns:
        (rvec.FloatVector) Data in R format
    """
    padded_rows = [[np.nan] + list(row) for row in list(x)]
    flattened = np.array(padded_rows).ravel()
    return rvec.FloatVector(flattened)
def adeSavGolCorrelation(country):
    """
    Report, for every algorithm prefix, the correlation between the first
    two columns whose names start with that prefix.

    The data is loaded with ``bpp.algo.importData(country)``. Each pair is
    compared with Pearson's r and with the first value returned by
    ``minerva.mine`` (reported under the 'MIC' label). The table is printed
    and written to an Excel file.

    :param country: identifier passed to ``bpp.algo.importData``
    """
    data = bpp.algo.importData(country)
    algos = ['I2', 'I1', 'GM2', 'GM1', 'FF1', 'MP1', 'MP4', 'MP2']
    row_labels = ['Pearson', 'MIC']
    correlation_results = pd.DataFrame(columns=algos, index=row_labels)

    for prefix in algos:
        matching = [
            name for name in data.columns.values if name.startswith(prefix)
        ]
        pair = data[matching]
        first = pair.iloc[:, 0]
        second = pair.iloc[:, 1]

        pearson_result = sp.stats.pearsonr(first, second)
        mine_result = minerva.mine(
            vc.FloatVector(np.asarray(first)),
            vc.FloatVector(np.asarray(second)))

        scores = [pearson_result[0], float(mine_result[0][0])]
        correlation_results[prefix] = pd.Series(scores, index=row_labels)

    print(correlation_results)
    correlation_results.to_excel(
        "../Reserach/AdeSavGol Transform/L&C_corr.xlsx", engine="openpyxl")
def perform_mann_withney_test(dataframe, sf_name, exon_type):
    """
    Test the values of each project in ``sf_name`` against the values of
    the ``exon_type`` project with ``mann_withney_test_r``.

    NaN values are removed from every sample first. The raw p-values are
    corrected with R's ``p.adjust`` (method "BH").

    :param dataframe: (pandas DataFrame) table with a "project" column and
        a "values" column
    :param sf_name: (list of string) the projects to test
    :param exon_type: (string) the project used as control
    :return: (pandas DataFrame) one row per entry of ``sf_name`` with the
        columns "SF", "pval_MW" and "p_adj"
    """
    rstats = importr('stats')

    def _clean_values(project):
        # Non-NaN "values" of the rows belonging to ``project``.
        selected = dataframe.loc[dataframe["project"] == project, "values"]
        as_float = np.array(selected.values, dtype=float)
        return list(as_float[~np.isnan(as_float)])

    list_ctrl = _clean_values(exon_type)
    pval_list = [
        mann_withney_test_r(_clean_values(my_sf), list_ctrl)
        for my_sf in sf_name
    ]
    pcor = rstats.p_adjust(v.FloatVector(pval_list), method="BH")
    result = pd.DataFrame({"SF": sf_name, "pval_MW": pval_list, "p_adj": pcor})
    return result[["SF", "pval_MW", "p_adj"]]
import pytest
from rpy2.robjects import vectors
from rpy2.robjects.packages import importr
from rpy2.ipython import html

base = importr('base')


@pytest.mark.parametrize(
    'o,func',
    [(vectors.IntVector([1, 2, 3]), html.html_vector_horizontal),
     (vectors.FloatVector([1, 2, 3]), html.html_vector_horizontal),
     # The commas between 'b' and 'c' matter: without them Python joins the
     # two adjacent literals into the single string 'bc'.
     (vectors.StrVector(['a', 'b', 'c']), html.html_vector_horizontal),
     (vectors.FactorVector(['a', 'b', 'c']), html.html_vector_horizontal),
     (vectors.ListVector({'a': 1, 'b': 2}), html.html_rlist),
     (vectors.DataFrame({'a': 1, 'b': 'z'}), html.html_rdataframe),
     ('x <- c(1, 2, 3)', html.html_sourcecode),
     (base.c, html.html_ridentifiedobject)])
def test_html_func(o, func):
    """Each HTML rendering helper returns a ``str`` for its kind of object."""
    res = func(o)
    assert isinstance(res, str)
def main():
    """
    Draw a histogram of one column of a tab-separated file into a PDF.

    Command-line arguments (``sys.argv``):
        1. input file name
        2. output PDF file name
        3. 1-based index of the column to plot
        4. plot title
        5. x-axis label
        6. number of breaks (0 selects R's "Sturges" rule)
        7. "true" to overlay a density curve
        8. (optional) "true" to plot frequencies instead of probabilities

    Empty lines, lines starting with '#', lines without the requested column
    and lines whose value cannot be converted to a float are skipped and
    counted. The process exits with a message when the column argument is
    not an integer, when the input has no lines, when no line holds a
    numeric value, or when plotting raises.
    """
    # Handle input params
    in_fname = sys.argv[1]
    out_fname = sys.argv[2]
    try:
        column = int(sys.argv[3]) - 1
    except Exception:
        sys.exit("Column not specified, your query does not contain a column of numerical data.")
    title = sys.argv[4]
    xlab = sys.argv[5]
    breaks = int(sys.argv[6])
    if breaks == 0:
        breaks = "Sturges"
    density = sys.argv[7] == "true"
    frequency = len(sys.argv) >= 9 and sys.argv[8] == "true"

    matrix = []
    skipped_lines = 0
    first_invalid_line = 0
    invalid_value = ''
    n_lines = 0
    # The context manager closes the input even if reading fails.
    with open(in_fname) as handle:
        for n_lines, line in enumerate(handle, start=1):
            line = line.rstrip('\r\n')
            value = None
            # Skip empty lines and comments
            if line and not line.startswith('#'):
                try:
                    field = line.split("\t")[column]
                except IndexError:
                    # The line has too few columns.
                    pass
                else:
                    try:
                        # NOTE(review): "NA" is not accepted by float(), so
                        # such lines are reported as invalid (as before) —
                        # confirm whether they should be plotted as NaN.
                        value = float(field)
                    except ValueError:
                        if not first_invalid_line:
                            invalid_value = field
            if value is None:
                skipped_lines += 1
                if not first_invalid_line:
                    first_invalid_line = n_lines
            else:
                matrix.append(value)

    # Decide on the number of usable values, not on the index of the last
    # line: a single valid line must be plotted, and a file made only of
    # invalid lines is not "empty".
    if matrix:
        try:
            grdevices = importr('grDevices')
            graphics = importr('graphics')
            vector = vectors.FloatVector(matrix)
            grdevices.pdf(out_fname, 8, 8)
            histogram = graphics.hist(vector, probability=not frequency, main=title, xlab=xlab, breaks=breaks)
            if density:
                density_curve = r.density(vector)
                if frequency:
                    # uniform bandwidth taken from first 2 midpoints
                    # NOTE(review): name-based indexing of the R results is
                    # kept as written — confirm it is supported by the rpy2
                    # version in use.
                    scale_factor = len(matrix) * (histogram['mids'][1] - histogram['mids'][0])
                    # Materialise the scaled values; a lazy ``map`` object is
                    # not a vector.
                    density_curve['y'] = vectors.FloatVector(
                        [y * scale_factor for y in density_curve['y']])
                graphics.lines(density_curve)
            grdevices.dev_off()
        except Exception as exc:
            sys.exit("%s" % str(exc))
    elif n_lines == 0:
        sys.exit("Input dataset is empty.")
    else:
        sys.exit("All values in column %s are non-numeric." % sys.argv[3])

    print("Histogram of column %s. \n" % sys.argv[3])
    if skipped_lines > 0:
        print("Skipped %d invalid lines starting with line #%d, '%s'." % (skipped_lines, first_invalid_line, invalid_value))
def shapiro_test(list_val): """ :param list_val: (list of float) list of frequency for a feature corresponding to a set of exon :return: (float) p-value of the shapiro-wilk test """ shapiro_test = rpy2.robjects.r( """ function(list_val){ return(shapiro.test(list_val)$p.value) } """ ) return shapiro_test(v.FloatVector(list_val))[0] def comparison_test(list_val1, list_val2, test): """ :param list_val1: (list of float) list of frequency for a feature corresponding to a set of exon :param list_val2: (list of float) list of frequency for a feature corresponding to another set of exon :param test: (string) the type of test to use :return: (float) pvalue of the comparison test """ ttest = rpy2.robjects.r( """ function(list_val1, list_val2){ return(t.test(list_val1, list_val2)$p.value) }