import os
import shutil
import tempfile

import biom
import numpy as np
import pandas as pd
from q2_types.feature_data import DNAFASTAFormat
from qiime2 import Artifact


def merge(table1: biom.Table, table2: biom.Table) -> biom.Table:
    table1_sids = set(table1.ids(axis='sample'))
    table2_sids = set(table2.ids(axis='sample'))
    shared_sids = table1_sids & table2_sids
    if len(shared_sids) > 0:
        raise ValueError('Some samples are present in both tables: %s'
                         % ', '.join(shared_sids))
    return table1.merge(table2)
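# A minimal usage sketch (toy data, not from the original source): two tables
# with disjoint sample ids merge cleanly; overlapping sample ids would raise
# a ValueError instead.
def _merge_example():
    t1 = biom.Table(np.array([[1, 2], [3, 4]]), ['O1', 'O2'], ['S1', 'S2'])
    t2 = biom.Table(np.array([[5, 6], [7, 8]]), ['O1', 'O3'], ['S3', 'S4'])
    merged = merge(t1, t2)
    # the merged table carries the union of samples and observations
    assert set(merged.ids('sample')) == {'S1', 'S2', 'S3', 'S4'}
    assert set(merged.ids('observation')) == {'O1', 'O2', 'O3'}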
def pad_features_in_test_data(train_table: biom.Table,
                              test_table: biom.Table) -> biom.Table:
    '''Align the test table to the train table's feature set by zero-padding
    the features that exist only in the train table into the test table.

    Parameters
    ----------
    train_table : biom.Table
        A biom table with the train data
    test_table : biom.Table
        A biom table with the test data

    Returns
    -------
    new_test_table : biom.Table
        The test table updated to contain the identical set of features as
        the train table.
    '''
    train_feature_ids = train_table.ids(axis='observation')
    test_feature_ids = test_table.ids(axis='observation')
    # biom.Table.shape is (n_observations, n_samples)
    n_samples = test_table.shape[1]
    sample_ids = test_table.ids(axis='sample')

    train_uniq_f = list(set(train_feature_ids) - set(test_feature_ids))
    shared_f = set(train_feature_ids).intersection(set(test_feature_ids))

    # create a zero matrix for all features that exist only in the train table
    padding_table = biom.Table(np.zeros((len(train_uniq_f), n_samples)),
                               train_uniq_f, sample_ids)

    # drop the test features that do not exist in the train table
    test_table.filter(shared_f, axis='observation')
    n_filtered_features = test_table.shape[0]
    if n_filtered_features == 0:
        raise ValueError('No feature overlap between train and test table! '
                         'Check the feature-format consistency between '
                         'tables!')

    # merge the zero-padding features into the filtered test table
    new_test_table = test_table.merge(padding_table)
    return new_test_table
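# A minimal sketch (toy data, not from the original source) of the padding
# behavior: the feature unique to the train table ('O1') is added to the test
# table as an all-zero row, and the test-only feature ('O4') is dropped.
def _pad_features_example():
    train = biom.Table(np.ones((3, 2)), ['O1', 'O2', 'O3'], ['A', 'B'])
    test = biom.Table(np.ones((3, 2)), ['O2', 'O3', 'O4'], ['C', 'D'])
    padded = pad_features_in_test_data(train, test)
    assert set(padded.ids('observation')) == {'O1', 'O2', 'O3'}
    # the padded feature is all zeros across the test samples
    assert padded.data('O1', axis='observation').sum() == 0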
def cluster_features(query_table: biom.Table,
                     closed_reference_table: biom.Table,
                     query_sequences: DNAFASTAFormat,
                     reference_sequences: pd.Series, thr: float = 0.97,
                     threads: int = 1, output_log_file: str = None
                     ) -> (biom.Table, DNAFASTAFormat, DNAFASTAFormat):
    # get_reference_seqs_from_ids, cluster_features_closed_reference and LOG
    # are helpers defined elsewhere in this package
    reference_sequences_fasta = get_reference_seqs_from_ids(
        closed_reference_table, reference_sequences)
    results = cluster_features_closed_reference(
        sequences=query_sequences, table=query_table,
        reference_sequences=reference_sequences_fasta, perc_identity=thr,
        threads=threads)
    clustered_table_biom = results[0]
    clustered_sequences_pd = Artifact.load(str(results[1])).view(pd.Series)
    unmatched_sequences_pd = Artifact.load(str(results[2])).view(pd.Series)

    # create a named temporary file for the log; it has to outlive this block
    # so that it can be copied to output_log_file below
    tmp_fd, tmp_fp = tempfile.mkstemp()
    os.close(tmp_fd)
    logger_ins = LOG(tmp_fp).get_logger('clustering_features')
    # the reference FASTA holds one sequence per feature of the
    # closed-reference table
    logger_ins.info("The number of OTUs in the reference database is %d",
                    len(closed_reference_table.ids('observation')))
    logger_ins.info("The number of unmatched sequences to the reference "
                    "alignment is %d", unmatched_sequences_pd.size)
    logger_ins.info("The number of matched sequences to the reference "
                    "alignment is %d", clustered_sequences_pd.size)
    logger_ins.info("Before applying clustering, the total number of counts "
                    "in the original feature table was %s",
                    np.sum(query_table.sum()))
    logger_ins.info("Before applying clustering, the number of non-zero "
                    "elements of the underlying feature table was %d",
                    query_table.nnz)
    logger_ins.info("After applying clustering, the total number of counts "
                    "in the clustered feature table was %s",
                    np.sum(clustered_table_biom.sum()))
    logger_ins.info("After applying clustering, the number of non-zero "
                    "elements of the underlying feature table was %d",
                    clustered_table_biom.nnz)
    logger_ins.info("The percent of total counts retained is %.2f%%",
                    np.sum(clustered_table_biom.sum()) /
                    np.sum(query_table.sum()) * 100)

    query_samples = clustered_table_biom.ids('sample')
    closed_reference_features = closed_reference_table.ids('observation')
    clustered_table_biom = closed_reference_table.merge(clustered_table_biom)
    clustered_table_biom.filter(ids_to_keep=query_samples, axis='sample',
                                inplace=True)
    # the sanity check compares feature ids, so it must look at the
    # observation axis of the merged table
    if len(set(closed_reference_features) -
            set(clustered_table_biom.ids('observation'))) != 0:
        raise ValueError("Merging the two tables failed! There are fewer "
                         "features in the final table than expected!")
    if output_log_file:
        shutil.copy(tmp_fp, output_log_file)
    return clustered_table_biom, results[1], results[2]
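# Note on the log calls above: the stdlib logging API uses printf-style lazy
# formatting, where the values are passed as arguments after the format
# string. A minimal, self-contained sketch with the standard logger (the LOG
# helper above is project-specific):
def _logging_style_example():
    import logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('clustering_features')
    n_matched, n_total = 90, 120
    # the format string is rendered only if the INFO level is enabled
    logger.info("Matched %d of %d sequences (%.1f%%)",
                n_matched, n_total, 100.0 * n_matched / n_total)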
def reorder_feature_table(query_table: biom.Table,
                          reference_table: biom.Table) -> biom.Table:
    query_samples = query_table.ids()
    ref_samples = set(reference_table.ids())
    for sample in query_samples:
        if sample in ref_samples:
            raise ValueError(
                "The sample %s is present in both the query and the "
                "reference tables, while the two tables should be "
                "disjoint." % sample)
    merged_table = reference_table.merge(query_table)
    # filter defaults to the sample axis, so keeping features needs an
    # explicit axis='observation'
    merged_table.filter(ids_to_keep=reference_table.ids('observation'),
                        axis='observation')
    merged_table.filter(ids_to_keep=query_samples)
    for sample in merged_table.ids():
        if sample in ref_samples:
            raise ValueError(
                "The sample %s from the reference table is still present "
                "after filtering, while the merged table should contain "
                "query samples only." % sample)
    # sort_order also defaults to the sample axis
    return merged_table.sort_order(reference_table.ids('observation'),
                                   axis='observation')
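# A minimal sketch (toy data, not from the original source): the query table
# is re-indexed so its features follow the reference table's feature order,
# while only the query samples survive.
def _reorder_example():
    reference = biom.Table(np.ones((3, 2)), ['O3', 'O1', 'O2'], ['R1', 'R2'])
    query = biom.Table(np.arange(6).reshape(3, 2),
                       ['O1', 'O2', 'O3'], ['Q1', 'Q2'])
    reordered = reorder_feature_table(query, reference)
    assert set(reordered.ids('sample')) == {'Q1', 'Q2'}
    # the observations come back in the reference's order
    assert list(reordered.ids('observation')) == ['O3', 'O1', 'O2']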
from os import makedirs
from os.path import basename, exists, join

from biom import Table, load_table
from biom.util import biom_open

# TRN, Artifact (Qiita's), get_mountpoint, get_db_files_base_dir and
# compute_checksum are Qiita internals assumed to be in scope.


def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information
    biom_data : dict
        Dictionary with the biom file information
    rarefied_table : biom.Table
        The rarefied BIOM table

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non-rarefied biom artifact is the initial biom table of the
    # analysis. This table does not currently exist anywhere, so we need to
    # actually create the BIOM file. To create this BIOM file we need: (1)
    # the samples and the artifacts they come from and (2) whether the
    # samples were renamed or not. (1) is in the database, but we need to
    # infer (2) from the existing rarefied BIOM table. Fun, fun...
    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id.
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # data types, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id)
                 FROM qiita.analysis_sample
                    JOIN qiita.artifact USING (artifact_id)
                 WHERE analysis_id = %s AND data_type_id = %s
                 GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # We know the BIOM table exists, so there is no need to check
            # whether biom_fp is undefined
            biom_table = load_table(biom_fp)
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # We need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update({sid: "%d.%s" % (a_id, sid)
                                for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            new_table.update_ids(ids_map, 'sample', True, True)

        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, data_type_id, visibility_id,
                     artifact_type_id, submitted_to_vamps)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(sql, [analysis['timestamp'], biom_data['data_type_id'],
                      4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])

        # Link the artifact with its file
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [basename(new_table_fp), 7,
                      compute_checksum(new_table_fp), 1, dd_id])
        fp_id = TRN.execute_fetchlast()
        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
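# A minimal sketch (toy data, not from the original source) of the renaming
# step above: update_ids maps each sample id to its "<artifact_id>.<sample>"
# form, mirroring the ids_map built in the loop (axis='sample', strict=True,
# inplace=True, as in the call above).
def _update_ids_example():
    table = Table(np.ones((1, 2)), ['O1'], ['S1', 'S2'])
    ids_map = {'S1': '5.S1', 'S2': '5.S2'}
    table.update_ids(ids_map, 'sample', True, True)
    assert set(table.ids()) == {'5.S1', '5.S2'}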