def ttest_location(channel, location, df):
    """Run a t-test of a single location against its channel.

    ``channel`` and ``location`` are both used as ``df.loc`` labels. The
    selected data is flattened to 1-D and handed to ``stats.ttest`` with the
    location sample first and the channel sample second; whatever that call
    returns is returned unchanged.
    """
    # NOTE(review): scipy.stats exposes ttest_ind / ttest_rel / ttest_1samp
    # but no plain ``ttest`` -- confirm which ``stats`` module this file uses.
    channel_values = np.ravel(df.loc[channel])
    location_values = np.ravel(df.loc[location])
    return stats.ttest(location_values, channel_values)
def run(self, experiment: Experiment) -> SPIAResult:
    """
    Run SPIA on the differentially expressed genes of ``experiment``.

    Genes whose ``ttest`` value is at or below ``self.threshold`` are treated
    as differentially expressed (DEGs). Every pathway returned by the KEGG
    lookup for a DEG is converted to an edge-index description and passed to
    ``SPIA.calculate_spia``. An empty result is returned (after printing a
    message) when there are no DEGs or no pathways.

    Returns:
        a list of pathways: pathway id, pathway name, pNDE, pPERT, pG,
        FDR correction, Bonferroni correction, status for each pathway
    """
    significant = ttest(experiment) <= self.threshold
    fold_change = experiment.calculate_fold_change()

    # Names of every gene that went through the test.
    all_gene_names = [gene.name for gene in significant.index.tolist()]

    # Fold change of each DEG, keyed by gene name.
    de = {
        gene.name: fold_change["FC"][gene]
        for gene in significant[significant].index.tolist()
    }
    if not de:
        # if there are no DEGs anywhere, the problem of finding the impact
        # on various pathways is meaningless
        print('No differentialy expressed genes.')
        return SPIAResult([])

    # Collect every pathway that mentions a DEG; the first description seen
    # for a pathway id is the one that is kept.
    db = KEGGPathways(self.organism)
    pathways = {}
    for gene_name in de:
        for pathway_id, description in db.search_by_gene(gene_name).items():
            pathways.setdefault(pathway_id, description)
    if not pathways:
        print('No pathways found in database.')
        return SPIAResult([])

    spia_input = {}
    for pathway_id in pathways:
        pathway = db.get_pathway(pathway_id)
        # The order of this list is arbitrary (it comes from a set), but the
        # edge indices below and 'row_names' both refer to the same list.
        path_genes = list(set(pathway.nodes))

        # Start with an empty edge list for every relation named in ``rel``.
        interactions = {relation: [] for relation in rel}
        for edge, edge_types in get_edge_attributes(pathway, 'type').items():
            relation = '_'.join(edge_types)
            endpoints = [path_genes.index(edge[0]), path_genes.index(edge[1])]
            interactions.setdefault(relation, []).append(endpoints)
        interactions['row_names'] = path_genes
        spia_input[pathway_id] = interactions
    spia_input['id2name'] = pathways

    scores = SPIA.calculate_spia(de, all_gene_names, spia_input)
    result = SPIAResult(scores)
    if self.markdown:
        result.generate_markdown(self.markdown, 'Results of SPIA:')
    return result
def test_ttest():
    """ttest(experiment) gives a pandas Series with an entry for every gene."""
    tumour_expression = [
        {'BAD': 1.2345, 'FUCA2': 6.5432},
        {'BAD': 2.3456, 'FUCA2': 7.6543},
    ]
    normal_expression = [
        {'BAD': 6.3456, 'FUCA2': 11.6543},
        {'BAD': 7.1111, 'FUCA2': 9.9711},
    ]

    # Samples are named Tumour_1, Tumour_2 / Normal_1, Normal_2.
    tumour_samples = [
        Sample.from_names(f'Tumour_{number}', expression)
        for number, expression in enumerate(tumour_expression, start=1)
    ]
    normal_samples = [
        Sample.from_names(f'Normal_{number}', expression)
        for number, expression in enumerate(normal_expression, start=1)
    ]

    experiment = Experiment(
        case=SampleCollection('Tumour', tumour_samples),
        control=SampleCollection('Normal', normal_samples),
    )

    result = ttest(experiment)

    assert isinstance(result, pd.Series)
    tested_genes = list(result.keys())
    assert all(gene in tested_genes for gene in experiment.get_all().genes)
def test_ttest_norm(self):
    """ttest(cart_array2, cart_array3) equals 1 - p-value of ttest_ind.

    The reference p-value comes from ``ttest_ind`` called on the same two
    arrays with ``equal_var=False``.
    """
    actual = ttest(self.cart_array2, self.cart_array3)
    reference = ttest_ind(
        self.cart_array2, self.cart_array3, equal_var=False)
    expected = 1 - reference.pvalue
    self.assertTrue(float_equals(actual, expected))
def test_ttest_short2(self):
    """ttest(cart_array4, cart_array3) equals ``c.single_item_cart_max``."""
    score = ttest(self.cart_array4, self.cart_array3)
    matches_expected = float_equals(c.single_item_cart_max, score)
    self.assertTrue(matches_expected)
def test_ttest_less(self):
    """ttest(cart_array1, cart_array2) equals ``c.low_score``."""
    score = ttest(self.cart_array1, self.cart_array2)
    matches_expected = float_equals(c.low_score, score)
    self.assertTrue(matches_expected)
def run(self, experiment: Experiment) -> ImpactAnalysisResult:
    """
    Run impact analysis on ``experiment``.

    Side effects: stores ``self.experiment_genes``, ``self.FC`` and
    ``self.degs``, and excludes from ``experiment`` every gene whose fold
    change is NaN. An empty result is returned (after printing a message)
    when there are no DEGs or no pathways.

    Returns:
        list of pathways sorted by their impact factor. Each pathway in
        the list has values of FDR and Bonferroni corrections assigned.
    """
    self.experiment_genes = {
        gene.name for gene in experiment.get_all().genes
    }

    # calculate fold change
    self.FC = experiment.calculate_fold_change()

    # remove genes for which fold change cannot be calculated correctly
    fold_changes = self.FC['FC']
    experiment.exclude_genes(list(fold_changes[isnan(fold_changes)].index))

    if self.degs:
        # NOTE(review): ``experiment_genes`` holds gene *names* while this
        # filter tests ``Gene`` objects, and it keeps the ones that are NOT
        # in the set -- confirm this is the intended selection.
        supplied_degs = {}
        for name in self.degs:
            if Gene(name) not in self.experiment_genes:
                supplied_degs[Gene(name)] = True
        self.degs = pd.Series(supplied_degs)
    else:
        # select differentially expressed genes
        significant = ttest(experiment) <= self.threshold
        self.degs = significant[significant]

    if self.degs.size == 0:
        # if there are no DEGs anywhere, the problem of finding the impact
        # on various pathways is meaningless
        print('No differentialy expressed genes.')
        return ImpactAnalysisResult([])

    # Collect every pathway that mentions a DEG; the first description seen
    # for a pathway code is the one that is kept.
    db = KEGGPathways(self.org)
    pathways = {}
    deg_names = [gene.name for gene in self.degs.index]
    for gene_name in deg_names:
        for code, description in db.search_by_gene(gene_name).items():
            pathways.setdefault(code, description)
    if not pathways:
        print('No pathways found in database.')
        return ImpactAnalysisResult([])

    res = pd.DataFrame(columns=['name', 'IF', 'pvalue'])
    for code, description in pathways.items():
        impact_factor, pval = self.calculate_impact_factor(
            experiment, db.get_pathway(code))
        # Pathways without both an impact factor and a p-value are skipped.
        if impact_factor is None or pval is None:
            continue
        res.loc[len(res.index)] = [description, impact_factor, pval]
    res['FDR'], res['Bonferroni'] = self.calculate_corrections(res['pvalue'])

    ifp_pathways = [IAPathway(res.loc[row]) for row in range(len(res.index))]

    def impact_or_zero(pathway):
        # NaN impact factors are ranked as 0.
        return 0 if isnan(pathway.IF) else pathway.IF

    ifp_pathways.sort(key=impact_or_zero, reverse=True)

    result = ImpactAnalysisResult(ifp_pathways)
    if self.markdown:
        result.generate_markdown(self.markdown, 'Results of Impact Analysis:')
    return result
def get_group_figure(self):
    """Draw the group-average figure of the skeleton and store it in ``self.g``.

    - one point per row of ``self.df`` (column ``mean``), split by ``group``
    - a horizontal line at each group's average
    - pairwise t-test results between groups, written beside the axes
    - an ANOVA result when there are more than two groups
    """
    plt.style.use('default')

    groups = self.df.group.unique()
    means = self.df['mean']

    self.g = sns.catplot(x='group', y='mean', hue='group',
                         hue_order=groups, data=self.df)

    if means.mean() < 0.005:
        # Very small values: fit the y-range to the data, padded by a third
        # of the standard deviation on BOTH sides. The upper bound previously
        # subtracted the padding, which put the axis top below the largest
        # data point and clipped it.
        padding = means.std() / 3
        self.g.ax.set_ylim(means.min() - padding, means.max() + padding)

    self.g.fig.set_size_inches(8, 4)
    self.g.fig.set_dpi(150)
    self.g.ax.set_ylabel(f'{self.modality}')
    self.g.ax.set_xlabel('Group')
    self.g.ax.set_title(
        f'Average {self.modality} in skeleton for all subjects',
        fontweight='bold')

    # tick labels show the number of rows in each group
    def get_ticklabels(tmp_df, group):
        row_count_for_group = len(tmp_df[tmp_df.group == group])
        return f'{group} ({row_count_for_group})'

    self.g.ax.set_xticklabels(
        [get_ticklabels(self.df, group) for group in groups])

    # average line for each group, centred on the group's x position
    line_width = 0.3
    grouped = self.df.groupby('group')
    for num, group in enumerate(groups):
        average = grouped.get_group(group)['mean'].mean()
        self.g.ax.plot([num - line_width, num + line_width],
                       [average, average])

    # Add stat information to the graph, stacked downwards from the top
    # (axes coordinates).
    height = 0.9
    group_pairs = list(combinations(groups, 2))
    if len(groups) == 2:
        height_step = 0.8 / len(group_pairs)
    else:
        # one extra slot below the pairwise results
        height_step = 0.8 / (len(group_pairs) + 1)

    # two group comparisons
    # TODO: add ANCOVA
    for g1, g2 in group_pairs:
        g1_means = grouped.get_group(g1)['mean']
        g2_means = grouped.get_group(g2)['mean']
        t, p, dof = ttest(g1_means, g2_means)

        # a trailing asterisk marks P < 0.05
        marker = '*' if p < 0.05 else ''
        text = (f'{g1} vs {g2}\n'
                f'T ({int(dof)}) = {t:.2f}, P = {p:.2f}{marker}')
        self.g.ax.text(1, height, text, transform=self.g.ax.transAxes)
        height -= height_step

    # ANOVA if there are more than two groups
    if len(groups) > 2:
        anova_df = anova(self.df, 'mean ~ group')
        f_val = anova_df.loc['group', 'F']
        dof = anova_df.loc['group', 'df']
        p = anova_df.loc['group', 'PR(>F)']

        marker = '*' if p < 0.05 else ''
        text = (f'ANOVA\nF ({int(dof)}) = '
                f'{f_val:.2f}, P = {p:.2f}{marker}')
        self.g.ax.text(1, height, text, transform=self.g.ax.transAxes)
# NOTE(review): the next two statements are the tail of a function whose
# header lies above this excerpt; the indentation depth is assumed -- confirm.
    saveText(collegeData)
    return collegeData


# Build the per-college data set used by the chart below.
collegeData = getFullData(colleges, reddit)

# --- Running T-Test ---
# Polarity data for the two colleges being compared.
clg1Pol = clg.getPolarityData("princeton", reddit)
clg2Pol = clg.getPolarityData("mit", reddit)
# --- End T-Test ---
# NOTE(review): scipy.stats has ttest_ind / ttest_rel / ttest_1samp but no
# plain ``ttest`` -- confirm which ``stats`` module is imported here.
print(stats.ttest(clg1Pol, clg2Pol))

# --- Creating Bar Chart ---
# One bar per entry of collegeData, using the first element of each entry
# (presumably the average polarity -- verify against getFullData).
collegePolarity = []
for i in collegeData:
    collegePolarity.append(collegeData[i][0])

fig, ax = plt.subplots()
# The colour list has 20 entries: the fourth is red, all others blue.
# NOTE(review): this looks like it assumes 20 colleges -- confirm.
# ``rect`` is not used in the visible part of the script.
rect = plt.bar(x=range(len(collegeData)),
               height=collegePolarity,
               color=["blue"] * 3 + ["red"] + ["blue"] * 16)
# NOTE(review): the y-label and title texts may be swapped -- confirm.
ax.set_ylabel('Average Sentiment score vs Colleges')
ax.set_title('Sentiment Score')
plt.xticks(rotation=90)