def merge_case_controlDF_and_afterExclutionMD(afterExclusion_MD,
                                              case_controlDF):
    '''
    Combines case_controlDF with afterExclusion_MD and returns the result
    as a Metadata object.

    Parameters
    ----------
    afterExclusion_MD : Metadata object
        Metadata object with unwanted samples filtered out
    case_controlDF : dataframe
        dataframe with one column named case_control. The indexes are the
        same as the indexes of afterExclusion_MD. Values reflect if the
        index is a case, control, or Undefined

    Returns
    -------
    mergedMD : Metadata object
        Metadata object with unwanted samples filtered out and a
        case_control column that reflects if the index is a case, control,
        or Undefined
    '''
    # turn case_controlDF into a metadata object so it can be merged
    case_controlMD = Metadata(case_controlDF)

    # merge afterExclusion_MD and case_controlMD into one new metadata
    # object; merge keeps only IDs shared by both, which here is all of them
    # since case_controlDF was built from afterExclusion_MD's IDs
    mergedMD = Metadata.merge(afterExclusion_MD, case_controlMD)

    return mergedMD
def determine_cases_and_controls(verbose, afterExclusion_MD, query_line_dict,
                                 case_controlDF):
    '''
    Determines what samples are cases or controls using the queries in
    query_line_dict. The labels of each sample are stored in case_controlDF.

    Parameters
    ----------
    verbose : boolean
        Tells function if it should output print statements or not.
        True outputs print statements.
    afterExclusion_MD : Metadata object
        Metadata object with unwanted samples filtered out
    query_line_dict : dict of arrays of strings
        there are two sub arrays
        the 'control' key maps to queries that determine controls
        the 'case' key maps to queries that determine cases
    case_controlDF : dataframe
        dataframe with one column named case_control. The indexes are the
        same as the indexes of afterExclusion_MD.
        all values are Undefined to start

    Returns
    -------
    mergedMD : Metadata object
        Metadata object with unwanted samples filtered out and a
        case_control column that reflects if the index is a case, control,
        or Undefined

    Raises
    ------
    Exception
        If no samples fulfill the case or control queries
    '''
    if verbose:
        print("Metadata Object has %s samples" % (afterExclusion_MD.id_count))

    for key in query_line_dict:
        if key != 'case' and key != 'control':
            if verbose:
                print("Wrong keys used for query. Must be 'case' or "
                      "'control'.")
            continue
        # reset shrunk_MD so that filtering down to control samples does not
        # influence filtering down to case samples
        shrunk_MD = afterExclusion_MD
        # get query and filter down to control or case samples based on key
        query_lines = query_line_dict[key]
        try:
            ids = shrunk_MD.get_ids(' AND '.join(query_lines))
            shrunk_MD = shrunk_MD.filter_ids(ids)
            if verbose:
                print("%s %s samples " % (shrunk_MD.id_count, key))
        # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
        # not swallowed; chain the original error for easier debugging
        except Exception as err:
            raise Exception('No samples fulfill %s queries. Exited while '
                            'determining %s samples' % (key, key)) from err
        # relabel the matching IDs from Undefined to case or control
        ids = shrunk_MD.ids
        case_controlDF.loc[ids, 'case_control'] = key

    # turn case_controlDF into a metadata object
    case_controlMD = Metadata(case_controlDF)
    # merge afterExclusion_MD and case_controlMD into one new metadata object
    mergedMD = Metadata.merge(afterExclusion_MD, case_controlMD)

    return mergedMD
def determine_cases_and_controls(afterExclusion_MD, query_line_dict,
                                 case_controlDF):
    '''
    Determines what samples are cases or controls using the queries in
    query_line_dict. The labels of each sample are stored in case_controlDF.

    Parameters
    ----------
    afterExclusion_MD : Metadata object
        Metadata object with unwanted samples filtered out
    query_line_dict : dict of arrays of strings
        the arrays of strings are arrays of queries
        the 'control' key maps to queries that determine controls
        the 'case' key maps to queries that determine cases
    case_controlDF : dataframe
        dataframe with one column named case_control. The indexes are the
        same as the indexes of afterExclusion_MD.
        all values are Undefined to start

    Returns
    -------
    mergedMD : Metadata object
        Metadata object with unwanted samples filtered out and a
        case_control column that reflects if the index is a case, control,
        or Undefined
    '''
    # keep a handle on the unfiltered metadata so each key's filtering
    # starts from the full sample set
    afterExclusion_MD_full = afterExclusion_MD

    for key in query_line_dict:
        if key != 'case' and key != 'control':
            continue
        # reset shrunk_MD so that filtering down to control samples does not
        # influence filtering down to case samples
        shrunk_MD = afterExclusion_MD_full
        query_lines = query_line_dict[key]
        try:
            shrunk_MD = shrunk_MD.filter_ids(
                shrunk_MD.get_ids(' AND '.join(query_lines)))
        # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
        # not swallowed; exits the program as before on query failure
        except Exception:
            print(
                'No samples fulfill %s queries. Exited while determining '
                '%s samples' % (key, key))
            sys.exit(1)
        ids = shrunk_MD.ids
        # relabel the matching IDs from Undefined to case or control
        case_controlDF.loc[ids, 'case_control'] = key

    # turn case_controlDF into a metadata object
    case_controlMD = Metadata(case_controlDF)
    # merge afterExclusion_MD and case_controlMD into one new metadata object
    mergedMD = Metadata.merge(afterExclusion_MD, case_controlMD)

    return mergedMD
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    """Test whether alpha diversity differs between metadata groups.

    For every categorical metadata column that survives filtering, runs a
    Kruskal-Wallis test across all groups plus pairwise Kruskal-Wallis
    tests (Benjamini-Hochberg FDR corrected), writing one JSONP data file
    and one pairwise CSV per column, the merged metadata TSV, and the
    rendered visualization assets into ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Directory the visualization files are written to.
    alpha_diversity : pd.Series
        Alpha diversity values indexed by sample ID; the series name is
        used as the metric name in the output.
    metadata : qiime2.Metadata
        Sample metadata; must contain every ID in ``alpha_diversity``.

    Raises
    ------
    ValueError
        If no metadata column satisfies the visualizer's requirements.
    """
    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_all_unique=True,
                                       drop_zero_variance=True,
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # inner join drops samples that lack either a diversity value or a
        # value in this metadata column; both lengths are reported in the viz
        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat(
            [alpha_diversity, metadata_column.to_series()],
            axis=1, join='inner')
        filtered_data_length = data.shape[0]

        # collect per-group labels ("value (n=...)") and diversity values
        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        escaped_column = quote(column)
        # quote() leaves '/' unescaped by default; escape it by hand so the
        # column name is safe to use in a filename
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(
                        groups[i], groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    # kruskalwallis raises ValueError for untestable pairs
                    # (e.g. all values identical); record the skipped
                    # comparison so it can be reported in the visualization
                    filtered_group_comparisons.append([
                        '%s:%s' % (column, names[i]),
                        '%s:%s' % (column, names[j])
                    ])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        # Benjamini-Hochberg FDR correction across all pairwise p-values
        kw_H_pairwise['q-value'] = multipletests(kw_H_pairwise['p-value'],
                                                 method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        # write the JSONP payload consumed by the visualization's JS:
        # load_data(column, groups, lengths, overall-KW, pairwise-table, ...)
        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump(
                {
                    'initial': initial_data_length,
                    'filtered': filtered_data_length
                }, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            # strip newlines and escape quotes so the HTML table is a valid
            # single-quoted JS string literal
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(TEMPLATES, 'alpha_group_significance_assets',
                         'index.html')
    q2templates.render(
        index, output_dir, context={
            'columns': [quote(fn) for fn in filenames],
            'non_categorical_columns':
                ', '.join(sorted(non_categorical_columns)),
            'filtered_columns': ', '.join(sorted(filtered_columns)),
            'filtered_group_comparisons':
                '; '.join([' vs '.join(e)
                           for e in filtered_group_comparisons])
        })

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def alpha_correlation(output_dir: str, alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    """Correlate alpha diversity with every numeric metadata column.

    Writes one JSONP data file per column, the merged metadata TSV, and the
    rendered visualization assets into ``output_dir``.

    Raises
    ------
    ValueError
        If ``method`` is unknown, or no numeric non-empty metadata column
        remains after filtering.
    """
    try:
        corr_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    # Restrict the metadata to exactly the IDs carrying diversity values;
    # this also errors if any diversity ID is absent from the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Keep only numeric columns that are not entirely missing, remembering
    # which columns were dropped so the visualization can report them.
    cols_before = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric',
                                       drop_all_missing=True)
    filtered_columns = cols_before - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata contains only non-numeric or empty columns. This "
            "visualizer requires at least one numeric metadata column to "
            "execute.")

    # Save the merged metadata + diversity table for download from the viz.
    alpha_diversity.index.name = 'id'
    merged = metadata.merge(qiime2.Metadata(alpha_diversity.to_frame()))
    merged.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    for column in metadata.columns:
        md_col = metadata.get_column(column).drop_missing_values()

        # Pair metadata and diversity values; the inner join drops samples
        # lacking data in either column.
        paired = pd.concat([md_col.to_series(), alpha_diversity],
                           axis=1, join='inner')

        # compute correlation (result is (test statistic, p-value))
        result = corr_fn(paired[md_col.name], paired[alpha_diversity.name])

        warning = None
        if paired.shape[0] != alpha_diversity.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': paired.shape[0]}

        filename = 'column-%s.jsonp' % quote(column)
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % column)
            paired.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({'method': method.title(),
                       'testStat': '%1.4f' % result[0],
                       'pVal': '%1.4f' % result[1],
                       'sampleSize': paired.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'filtered_columns': ', '.join(sorted(filtered_columns))})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_correlation_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    """Visualize group-wise differences in alpha diversity.

    Each categorical metadata column that passes filtering is tested with
    an overall Kruskal-Wallis test and pairwise Kruskal-Wallis tests
    (Benjamini-Hochberg FDR corrected). Per column, a JSONP data file and
    a pairwise-results CSV are written; the merged metadata TSV and the
    rendered visualization assets also go into ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Directory the visualization files are written to.
    alpha_diversity : pd.Series
        Alpha diversity values indexed by sample ID; the series name is
        used as the metric name in the output.
    metadata : qiime2.Metadata
        Sample metadata; must contain every ID in ``alpha_diversity``.

    Raises
    ------
    ValueError
        If no metadata column satisfies the visualizer's requirements.
    """
    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(
        drop_all_unique=True, drop_zero_variance=True, drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # inner join drops samples missing either a diversity value or a
        # value in this column; both lengths are reported in the viz
        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_column.to_series()],
                         axis=1, join='inner')
        filtered_data_length = data.shape[0]

        # collect per-group labels ("value (n=...)") and diversity values
        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        escaped_column = quote(column)
        # quote() leaves '/' unescaped by default; escape it by hand so the
        # column name is safe to use in a filename
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                            groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    # kruskalwallis raises ValueError for untestable pairs
                    # (e.g. all values identical); record the skipped
                    # comparison so the viz can report it
                    filtered_group_comparisons.append(
                        ['%s:%s' % (column, names[i]),
                         '%s:%s' % (column, names[j])])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        # Benjamini-Hochberg FDR correction across all pairwise p-values
        kw_H_pairwise['q-value'] = multipletests(
            kw_H_pairwise['p-value'], method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        # write the JSONP payload consumed by the visualization's JS:
        # load_data(column, groups, lengths, overall-KW, pairwise-table, ...)
        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump({'initial': initial_data_length,
                       'filtered': filtered_data_length}, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            # strip newlines and escape quotes so the HTML table is a valid
            # single-quoted JS string literal
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'non_categorical_columns': ', '.join(sorted(non_categorical_columns)),
        'filtered_columns': ', '.join(sorted(filtered_columns)),
        'filtered_group_comparisons': '; '.join(
            [' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def alpha_correlation(output_dir: str, alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    """Compute the chosen correlation between alpha diversity and each
    numeric metadata column, emitting one JSONP data file per column plus
    the merged metadata TSV and rendered assets into ``output_dir``.

    Raises
    ------
    ValueError
        If ``method`` is unknown, or the metadata has no usable numeric
        column after filtering.
    """
    try:
        compute_correlation = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    # Drop metadata IDs without diversity values (and verify every diversity
    # ID exists in the metadata).
    metadata = metadata.filter_ids(alpha_diversity.index)

    all_columns = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric',
                                       drop_all_missing=True)
    # Columns removed by the numeric/non-empty filter, reported in the viz.
    filtered_columns = all_columns - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata contains only non-numeric or empty columns. This "
            "visualizer requires at least one numeric metadata column to "
            "execute.")

    # Persist the combined table so it can be downloaded from the viz.
    alpha_diversity.index.name = 'id'
    alpha_md = qiime2.Metadata(alpha_diversity.to_frame())
    metadata.merge(alpha_md).save(os.path.join(output_dir, 'metadata.tsv'))

    initial_n = alpha_diversity.shape[0]
    filenames = []
    for column in metadata.columns:
        column_values = metadata.get_column(column).drop_missing_values()

        # Inner join drops any sample missing either measurement.
        joined = pd.concat([column_values.to_series(), alpha_diversity],
                           axis=1, join='inner')

        # scores holds (test statistic, p-value)
        scores = compute_correlation(joined[column_values.name],
                                     joined[alpha_diversity.name])

        warning = None
        if initial_n != joined.shape[0]:
            warning = {'initial': initial_n,
                       'method': method.title(),
                       'filtered': joined.shape[0]}

        jsonp_name = 'column-%s.jsonp' % quote(column)
        filenames.append(jsonp_name)

        with open(os.path.join(output_dir, jsonp_name), 'w') as out:
            out.write("load_data('%s'," % column)
            joined.to_json(out, orient='split')
            out.write(",")
            json.dump(warning, out)
            out.write(",")
            json.dump({'method': method.title(),
                       'testStat': '%1.4f' % scores[0],
                       'pVal': '%1.4f' % scores[1],
                       'sampleSize': joined.shape[0]}, out)
            out.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'filtered_columns': ', '.join(sorted(filtered_columns))})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_correlation_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
def determine_cases_and_controls(afterExclusion_MD, query_line_dict, extra):
    '''
    Determines what samples are cases or controls using the queries in
    query_line_dict. The labels of each sample are stored in case_controlDF.

    Parameters
    ----------
    afterExclusion_MD : Metadata object
        Metadata object with unwanted samples filtered out
    query_line_dict : dict of arrays of strings
        there are two sub arrays
        the 'control' key maps to queries that determine controls
        the 'case' key maps to queries that determine cases
    extra : boolean
        Tells function whether to shrink metadata in one step or in
        multiple steps with extra print statements that show how many
        potential case or control samples are left after each query

    Returns
    -------
    mergedMD : Metadata object
        Metadata object with unwanted samples filtered out and a
        case_control column that reflects if the index is a case, control,
        or Unspecified

    Raises
    ------
    ValueError
        If the input file of sql commands for determining case and
        controls is empty
    '''
    # start every sample as "Unspecified"; matching queries below overwrite
    # this with 'case' or 'control'
    ids = afterExclusion_MD.get_ids()
    case_control_Series = pd.Series(["Unspecified"] * len(ids), ids)
    case_control_Series.index.name = afterExclusion_MD.id_header
    case_controlDF = case_control_Series.to_frame("case_control")

    print("Metadata Object has %s samples" % (afterExclusion_MD.id_count))
    for key in query_line_dict:
        if key != "case" and key != "control":
            print("Wrong key used for query. Must be 'case' or 'control'.")
            continue

        # reset shrunk_MD so that filtering down to control samples does not
        # influence filtering down to case samples
        shrunk_MD = afterExclusion_MD
        # get query and filter down to control or case samples based on key
        query_lines = query_line_dict[key]
        if len(query_lines) < 1:
            raise ValueError("The %s query file is empty" % (key))

        if extra:
            # apply queries one at a time so progress can be reported
            for line in query_lines:
                ids = shrunk_MD.get_ids(line)
                shrunk_MD = shrunk_MD.filter_ids(ids)
                print(line)
                print(
                    "\tFilters down number of potental %s samples left to %s"
                    % (key, shrunk_MD.id_count))
        else:
            # apply all queries in a single combined query
            ids = shrunk_MD.get_ids(' AND '.join(query_lines))
            shrunk_MD = shrunk_MD.filter_ids(ids)
        # BUG FIX: arguments were swapped, printing e.g.
        # "Final number of 12 samples is case"
        print("Final number of %s samples is %s"
              % (key, shrunk_MD.id_count))

        # relabel the matching IDs from Unspecified to case or control
        ids = shrunk_MD.ids
        case_controlDF.loc[ids, "case_control"] = key

    # turn case_controlDF into a metadata object
    case_controlMD = Metadata(case_controlDF)
    # merge afterExclusion_MD and case_controlMD into one new metadata object
    mergedMD = Metadata.merge(afterExclusion_MD, case_controlMD)

    return mergedMD