def __load_metadata_samples(self):
    """
    Loads the sample metadata file into memory
    """
    self.metadata = DataIO.tsv_to_table(self.user_id, self.pid, SAMPLE_METADATA_FILENAME)
def get_otu_table_headers_at_taxonomic_level(user_id, pid, level, use_raw=False):
    logger.info("Using raw data")
    labels = DataIO.tsv_to_table(user_id, pid, RAW_GENE_TABLE_LABELS_FILENAME)
    headers = labels[0]

    if int(level) == -1:
        # OTUs requested directly, so no collapsing by taxonomy is needed
        return headers

    taxonomy = Taxonomy(user_id, pid)
    taxonomy_map = taxonomy.get_taxonomy_map()

    # Collapses each OTU column into its taxonomy group at the requested level
    taxonomies = []
    taxonomy_to_cols = {}
    for j, otu in enumerate(headers):
        if otu in taxonomy_map:
            # Uncomment below if we want to use the fully qualified taxonomy string
            # taxonomy = "; ".join(taxonomy_map[otu][:int(level) + 1])
            taxonomy = taxonomy_map[otu][int(level)]
            if taxonomy != "":
                if taxonomy in taxonomy_to_cols:
                    taxonomy_to_cols[taxonomy].append(j)
                else:
                    taxonomy_to_cols[taxonomy] = [j]
                taxonomies.append(taxonomy)
    return taxonomies
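# A minimal, self-contained sketch of the column-collapsing logic above, run
# against hypothetical toy data (the OTU IDs and taxonomy strings below are
# illustrative placeholders, not values from any real project).
def _collapse_headers_example():
    headers = ["OTU_1", "OTU_2", "OTU_3"]
    taxonomy_map = {
        "OTU_1": ["Bacteria", "Firmicutes"],
        "OTU_2": ["Bacteria", "Firmicutes"],
        "OTU_3": ["Bacteria", "Bacteroidetes"],
    }
    level = 1
    taxonomies = []
    taxonomy_to_cols = {}
    for j, otu in enumerate(headers):
        taxonomy = taxonomy_map[otu][level]
        if taxonomy != "":
            taxonomy_to_cols.setdefault(taxonomy, []).append(j)
            taxonomies.append(taxonomy)
    # taxonomies       -> ["Firmicutes", "Firmicutes", "Bacteroidetes"]
    # taxonomy_to_cols -> {"Firmicutes": [0, 1], "Bacteroidetes": [2]}
    return taxonomies, taxonomy_to_cols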
def load_otu_table(self, user_id, pid, use_raw, use_np):
    self.user_id = user_id
    self.pid = pid

    logger.info("Before load")
    self.sample_metadata = Metadata(user_id, pid)
    logger.info("Finished metadata loading")

    self.otu_metadata = Taxonomy(user_id, pid)
    logger.info("Finished taxonomy loading")

    logger.info("Using raw data")
    if use_np:
        self.table = DataIO.tsv_to_np_table(self.user_id, self.pid, RAW_GENE_TABLE_FILENAME)
    else:
        self.table = DataIO.tsv_to_table(self.user_id, self.pid, RAW_GENE_TABLE_FILENAME)

    # The labels file stores the OTU headers on its first row and the sample labels on its second row
    labels = DataIO.tsv_to_table(self.user_id, self.pid, RAW_GENE_TABLE_LABELS_FILENAME)
    self.headers = labels[0]
    self.sample_labels = labels[1]
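# Hedged sanity-check sketch for the loader above: after load_otu_table() runs,
# the labels file is expected to supply one header per OTU column and one label
# per sample row. "table_obj" stands in for an instance of the enclosing class,
# whose name is not shown in this excerpt.
def _check_table_shape(table_obj):
    num_samples = len(table_obj.table)
    num_otus = len(table_obj.table[0]) if num_samples > 0 else 0
    assert len(table_obj.sample_labels) == num_samples, "expected one sample label per row"
    assert len(table_obj.headers) == num_otus, "expected one OTU header per column"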
def get_file_for_download(self, project_name, type):
    if type == "sample_metadata":
        return DataIO.tsv_to_table(self.user_id, project_name, SAMPLE_METADATA_FILENAME)
    elif type == "otu":
        table = DataIO.tsv_to_table(self.user_id, project_name, RAW_GENE_TABLE_FILENAME)
        labels = DataIO.tsv_to_table(self.user_id, project_name, RAW_GENE_TABLE_LABELS_FILENAME)

        # Rebuilds a single downloadable table with the OTU headers across the top
        # and each data row prefixed by its sample label
        new_headers = ["Sample Labels"]
        new_headers.extend(labels[0])
        full_table = [new_headers]
        for i, row in enumerate(table):
            new_row = [labels[1][i] if i < len(labels[1]) else ""]
            new_row.extend(row)
            full_table.append(new_row)
        return full_table
    else:
        return []
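# Minimal sketch of consuming get_file_for_download(): write the returned
# list-of-rows out as a tab-separated file. "downloads" stands in for an
# instance of the enclosing class, and the output path is a placeholder.
import csv

def _write_otu_download(downloads, project_name, out_path="otu_table_download.tsv"):
    rows = downloads.get_file_for_download(project_name, "otu")
    with open(out_path, "w", newline="") as f:
        csv.writer(f, delimiter="\t").writerows(rows)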
def test__create_project_from_tsv(self):
    project_manager = ProjectManager("unit_tests")

    # Copies the test fixtures into a fresh staging directory for the "unit_tests" user
    unit_tests_dir = os.path.join(TestProjectManager.UNIT_TESTS_DIRECTORY, "small_biom")
    test_staging_dir = os.path.join(TestProjectManager.STAGING_DIRECTORY, "unit_tests")
    if not os.path.exists(test_staging_dir):
        os.makedirs(test_staging_dir)
    shutil.copyfile(os.path.join(unit_tests_dir, "table.raw.tsv"),
                    os.path.join(test_staging_dir, "table.raw.tsv"))
    shutil.copyfile(os.path.join(unit_tests_dir, "taxonomy.tsv"),
                    os.path.join(test_staging_dir, "taxonomy.tsv"))
    shutil.copyfile(os.path.join(unit_tests_dir, "sample_metadata.tsv"),
                    os.path.join(test_staging_dir, "sample_metadata.tsv"))

    # Stages and creates the project from the copied TSV files
    status, pid = project_manager.stage_project_from_tsv(
        "tmp_project", "table.raw.tsv", "taxonomy.tsv", "sample_metadata.tsv", "")
    project_manager.create_project(pid, "", "", [])

    test_project_dir = os.path.join(TestProjectManager.UNIT_TESTS_DIRECTORY, pid)
    self.assertTrue(os.path.exists(test_project_dir))

    # Every subsampled row should sum to the subsampling depth of 30
    subsampled_table = DataIO.tsv_to_np_table("unit_tests", pid, "table.subsampled.tsv")
    self.assertEqual(6, len(subsampled_table))
    for row in subsampled_table:
        self.assertEqual(30, np.sum(row))

    # The labels file should hold one row of OTU headers and one row of sample labels
    subsampled_table_labels = DataIO.tsv_to_table(
        "unit_tests", pid, "table.subsampled.labels.tsv")
    self.assertEqual(2, len(subsampled_table_labels))
    self.assertEqual(5, len(subsampled_table_labels[0]))
    self.assertEqual(6, len(subsampled_table_labels[1]))

    shutil.rmtree(test_project_dir)
def stage_project_from_tsv(self, project_name, otu_filename, sample_metadata_filename):
    # Creates a directory for this project
    pid = str(uuid.uuid4())
    project_dir = os.path.join(ProjectManager.DATA_DIRECTORY, self.user_id)
    project_dir = os.path.join(project_dir, pid)
    if not os.path.exists(project_dir):
        os.makedirs(project_dir)
    else:
        logger.exception("Cannot create project folder")
        raise Exception("Cannot create project folder as it already exists")

    # Renames the uploaded files to a standard file schema and moves them to the project directory
    user_staging_dir = os.path.join(ProjectManager.STAGING_DIRECTORY, self.user_id)
    os.rename(os.path.join(user_staging_dir, sample_metadata_filename),
              os.path.join(project_dir, SAMPLE_METADATA_FILENAME))

    # Collects the sample IDs from the first column of the sample metadata file (skipping the header row)
    sample_ids_from_sample_metadata = {}
    sample_metadata = DataIO.tsv_to_table(self.user_id, pid, SAMPLE_METADATA_FILENAME,
                                          accept_empty_headers=False)
    for row in sample_metadata[1:]:
        if len(row) > 0:
            sample_ids_from_sample_metadata[row[0]] = 1

    logger.info("Beginning to load the OTU table")
    base_arr = DataIO.tsv_to_table_from_path(os.path.join(user_staging_dir, otu_filename),
                                             accept_empty_headers=False)

    # Processes the uploaded OTU file by removing unnecessary columns and extracting the headers and sample labels
    try:
        logger.info("Beginning to process the OTU table")
        base, headers, sample_labels, matrix_type = self.__process_base(
            self.user_id, pid, base_arr)
    except ValueError:
        logger.exception("OTU file contains non-integers")
        # Removes the project directory since the files in it are invalid
        shutil.rmtree(project_dir, ignore_errors=True)
        return OTU_DATATYPE_ERROR, ""
    except:
        logger.exception("Invalid OTU file format")
        # Removes the project directory since the files in it are invalid
        shutil.rmtree(project_dir, ignore_errors=True)
        return OTU_ERROR, ""

    # Creates the map.txt file that records the project's metadata
    logger.info("Creating the map.txt file")
    map_file = Map(self.user_id, pid)
    map_file.project_name = project_name
    map_file.orig_otu_table_name = otu_filename
    map_file.orig_sample_metadata_name = sample_metadata_filename
    map_file.matrix_type = matrix_type
    map_file.num_samples = len(sample_labels)
    map_file.num_otus = len(headers)
    map_file.save()

    return OK, pid
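# Hedged end-to-end sketch of the staging flow above: the caller is expected to
# have copied the uploads into their staging directory first, then calls
# stage_project_from_tsv() and checks the returned status code. "pm" and the
# file names are placeholders; OK is the status constant returned above.
def _stage_example(pm):
    status, pid = pm.stage_project_from_tsv(
        "demo_project", "table.raw.tsv", "sample_metadata.tsv")
    if status != OK:
        raise RuntimeError("Staging failed with status {}".format(status))
    return pid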
def __load_taxonomy(self):
    # The first row of the labels file holds the OTU headers, which key into the taxonomy mapping
    tax = DataIO.tsv_to_table(self.user_id, self.pid, RAW_GENE_TABLE_LABELS_FILENAME)
    headers = tax[0]
    self.taxonomy_map = self.__get_taxonomy_mapping_from_dict(headers)