def __process_base(self, user_id, pid, base_arr,
                   output_raw_otu_file_name=RAW_GENE_TABLE_FILENAME,
                   output_raw_otu_labels_file_name=RAW_GENE_TABLE_LABELS_FILENAME):
    """
    Transposes an uploaded table into a numeric matrix and writes the matrix
    and its labels out through DataIO.

    The first row of ``base_arr`` is read as the sample labels and the first
    column as the row headers; the top-left corner cell belongs to neither.
    Every remaining cell is parsed as a number (empty cells become 0) and the
    values are transposed, so each row of the returned matrix holds one input
    column.

    :param user_id: user identifier; also a component of the project path
    :param pid: project identifier; also a component of the project path
    :param base_arr: table as a list of rows of strings, including the label
        row and the label column
    :param output_raw_otu_file_name: file name the numeric matrix is written under
    :param output_raw_otu_labels_file_name: file name the label rows are written under
    :return: ``(base, headers, sample_labels, matrix_type)``; ``matrix_type`` is
        ``"float"`` if any parsed value is non-integral, otherwise ``"int"``
    :raises ValueError: if a non-empty cell cannot be parsed as a number
    :raises IndexError: if ``base_arr`` is empty or a data row has fewer cells
        than the first row
    """
    project_dir = os.path.join(ProjectManager.DATA_DIRECTORY, user_id, pid)
    raw_table_path = os.path.join(project_dir, output_raw_otu_file_name)

    # Row 0 and column 0 carry labels, not values.
    col_offset = 1
    row_offset = 1
    data_rows = base_arr[row_offset:]

    base = []
    matrix_type = "int"
    # Transpose: every input column becomes one row of the output matrix.
    for col in range(col_offset, len(base_arr[0])):
        new_row = []
        for data_row in data_rows:
            cell = data_row[col]
            if cell == "":
                # Empty values will default to zero
                new_row.append(0)
                continue
            val = float(cell)
            if val.is_integer():
                new_row.append(int(val))
            else:
                new_row.append(val)
                matrix_type = "float"
        base.append(new_row)

    # Skip the label row so that headers line up one-to-one with the columns
    # of ``base``; the corner cell is not a header.
    headers = [data_row[0] for data_row in data_rows]
    sample_labels = base_arr[0][col_offset:]
    labels = [headers, sample_labels]

    # NOTE(review): the matrix is written using the joined project path while
    # the labels use the bare file name - confirm DataIO.table_to_tsv accepts both.
    DataIO.table_to_tsv(base, user_id, pid, raw_table_path)
    DataIO.table_to_tsv(labels, user_id, pid, output_raw_otu_labels_file_name)

    return base, headers, sample_labels, matrix_type
def get_otu_table_headers_at_taxonomic_level(user_id, pid, level, use_raw=False):
    """
    Returns the table's column headers, either as stored or collapsed to a
    taxonomic level.

    :param user_id: user identifier passed through to DataIO and Taxonomy
    :param pid: project identifier passed through to DataIO and Taxonomy
    :param level: taxonomic level index (anything ``int()`` accepts); ``-1``
        requests the stored headers unchanged
    :param use_raw: not read by this function
    :return: the stored headers when ``level`` is -1; otherwise the distinct
        non-empty taxonomy names at that level, in order of first appearance
    """
    logger.info("Using raw data")
    label_rows = DataIO.tsv_to_table(user_id, pid, RAW_GENE_TABLE_LABELS_FILENAME)
    headers = label_rows[0]

    level_index = int(level)
    if level_index == -1:
        # OTUs requested
        return headers

    taxonomy_map = Taxonomy(user_id, pid).get_taxonomy_map()

    taxonomies = []
    seen = set()
    for otu in headers:
        if otu not in taxonomy_map:
            continue
        # Only the name at the requested level is used, not the full lineage.
        name = taxonomy_map[otu][level_index]
        if name == "" or name in seen:
            continue
        seen.add(name)
        taxonomies.append(name)
    return taxonomies
def __load_metadata_samples(self):
    """
    Reads the project's sample metadata file through DataIO and keeps the
    resulting table on ``self.metadata``.

    :return: None
    """
    metadata_table = DataIO.tsv_to_table(
        self.user_id, self.pid, SAMPLE_METADATA_FILENAME)
    self.metadata = metadata_table
def load_otu_table(self, user_id, pid, use_raw, use_np):
    """
    Populates this object with a project's metadata, taxonomy, table and labels.

    :param user_id: user identifier, stored on ``self.user_id``
    :param pid: project identifier, stored on ``self.pid``
    :param use_raw: not read by this method
    :param use_np: when truthy the table is read with
        ``DataIO.tsv_to_np_table``, otherwise with ``DataIO.tsv_to_table``
    :return: None
    """
    self.user_id = user_id
    self.pid = pid

    logger.info("Before load")
    self.sample_metadata = Metadata(user_id, pid)
    logger.info("Finished metadata loading")
    self.otu_metadata = Taxonomy(user_id, pid)
    logger.info("Finished taxonomy loading")

    logger.info("Using raw data")
    read_table = DataIO.tsv_to_np_table if use_np else DataIO.tsv_to_table
    self.table = read_table(self.user_id, self.pid, RAW_GENE_TABLE_FILENAME)

    # The labels file holds two rows: headers first, sample labels second.
    label_rows = DataIO.tsv_to_table(
        self.user_id, self.pid, RAW_GENE_TABLE_LABELS_FILENAME)
    self.headers = label_rows[0]
    self.sample_labels = label_rows[1]
def test__create_project_from_tsv(self):
    """
    Stages and creates a project from the ``small_biom`` fixture files, then
    checks the subsampled table (6 rows, each summing to 30) and its labels
    file (2 rows holding 5 and 6 entries).
    """
    project_manager = ProjectManager("unit_tests")
    fixture_dir = os.path.join(TestProjectManager.UNIT_TESTS_DIRECTORY, "small_biom")
    staging_dir = os.path.join(TestProjectManager.STAGING_DIRECTORY, "unit_tests")
    if not os.path.exists(staging_dir):
        os.makedirs(staging_dir)

    # Place the fixture files where the project manager looks for uploads.
    for file_name in ("table.raw.tsv", "taxonomy.tsv", "sample_metadata.tsv"):
        shutil.copyfile(os.path.join(fixture_dir, file_name),
                        os.path.join(staging_dir, file_name))

    # NOTE(review): five positional arguments are passed and `status` is never
    # asserted - confirm this matches the current stage_project_from_tsv signature.
    status, pid = project_manager.stage_project_from_tsv(
        "tmp_project", "table.raw.tsv", "taxonomy.tsv", "sample_metadata.tsv", "")
    project_manager.create_project(pid, "", "", [])

    created_project_dir = os.path.join(TestProjectManager.UNIT_TESTS_DIRECTORY, pid)
    self.assertTrue(os.path.exists(created_project_dir))

    subsampled_table = DataIO.tsv_to_np_table("unit_tests", pid, "table.subsampled.tsv")
    self.assertEqual(6, len(subsampled_table))
    for table_row in subsampled_table:
        self.assertEqual(30, np.sum(table_row))

    label_rows = DataIO.tsv_to_table("unit_tests", pid, "table.subsampled.labels.tsv")
    self.assertEqual(2, len(label_rows))
    self.assertEqual(5, len(label_rows[0]))
    self.assertEqual(6, len(label_rows[1]))

    shutil.rmtree(created_project_dir)
def get_file_for_download(self, project_name, type):
    """
    Builds the table a user downloads for one of a project's files.

    :param project_name: project identifier passed through to DataIO
    :param type: ``"sample_metadata"`` returns the metadata table as read;
        ``"otu"`` returns the raw table with a header row and a leading
        sample-label column added; any other value returns an empty list
    :return: list of rows
    """
    if type == "sample_metadata":
        return DataIO.tsv_to_table(self.user_id, project_name, SAMPLE_METADATA_FILENAME)
    if type != "otu":
        return []

    table = DataIO.tsv_to_table(self.user_id, project_name, RAW_GENE_TABLE_FILENAME)
    labels = DataIO.tsv_to_table(self.user_id, project_name, RAW_GENE_TABLE_LABELS_FILENAME)

    # Header row: a corner title followed by the stored column headers.
    full_table = [["Sample Labels"] + list(labels[0])]
    for row_index, values in enumerate(table):
        sample_labels = labels[1]
        # Rows without a matching stored label get an empty label cell.
        if row_index < len(sample_labels):
            row_label = sample_labels[row_index]
        else:
            row_label = ""
        full_table.append([row_label] + list(values))
    return full_table
def stage_project_from_tsv(self, project_name, otu_filename, sample_metadata_filename):
    """
    Creates a new project directory from files uploaded to the user's staging
    directory.

    The sample metadata file is moved into the project directory under its
    standard name, the OTU table is read from staging and processed, and a
    map file recording the original file names and table dimensions is saved.

    :param project_name: display name stored in the map file
    :param otu_filename: name of the uploaded OTU table in the staging directory
    :param sample_metadata_filename: name of the uploaded sample metadata file
        in the staging directory
    :return: ``(OK, pid)`` on success; ``(OTU_DATATYPE_ERROR, "")`` if
        processing the table raises ``ValueError``; ``(OTU_ERROR, "")`` if it
        raises any other exception. In both failure cases the project
        directory is removed.
    :raises Exception: if the generated project directory already exists
    """
    # Creates a directory for this project
    pid = str(uuid.uuid4())
    project_dir = os.path.join(ProjectManager.DATA_DIRECTORY, self.user_id, pid)
    if os.path.exists(project_dir):
        # No exception is being handled here, so log a plain error rather
        # than asking the logger for a (non-existent) traceback.
        logger.error("Cannot create project folder")
        raise Exception(
            "Cannot create project folder as it already exists")
    os.makedirs(project_dir)

    # Renames the uploaded files to a standard file schema and moves to the project directory
    user_staging_dir = os.path.join(ProjectManager.STAGING_DIRECTORY, self.user_id)
    os.rename(os.path.join(user_staging_dir, sample_metadata_filename),
              os.path.join(project_dir, SAMPLE_METADATA_FILENAME))

    # The metadata is still read so that any error DataIO raises for it
    # surfaces here; the parsed result is not otherwise used.
    # NOTE(review): the table's sample labels are not cross-checked against
    # the metadata sample IDs - confirm whether that validation is intended.
    DataIO.tsv_to_table(self.user_id, pid, SAMPLE_METADATA_FILENAME,
                        accept_empty_headers=False)

    logger.info("Beginning to load the OTU table")
    base_arr = DataIO.tsv_to_table_from_path(
        os.path.join(user_staging_dir, otu_filename), accept_empty_headers=False)

    # Processes the uploaded OTU file by extracting the values, headers and sample labels
    try:
        logger.info("Beginning to process the OTU table")
        _, headers, sample_labels, matrix_type = self.__process_base(
            self.user_id, pid, base_arr)
    except ValueError:
        logger.exception("OTU file contains non-integers")
        # Removes the project directory since the files in it are invalid
        shutil.rmtree(project_dir, ignore_errors=True)
        return OTU_DATATYPE_ERROR, ""
    except Exception:
        # Deliberately broad, but KeyboardInterrupt/SystemExit must still propagate.
        logger.exception("Invalid OTU file format")
        # Removes the project directory since the files in it are invalid
        shutil.rmtree(project_dir, ignore_errors=True)
        return OTU_ERROR, ""

    # Creates map.txt file
    logger.info("Creating the map.txt file")
    map_file = Map(self.user_id, pid)
    map_file.project_name = project_name
    map_file.orig_otu_table_name = otu_filename
    map_file.orig_sample_metadata_name = sample_metadata_filename
    map_file.matrix_type = matrix_type
    map_file.num_samples = len(sample_labels)
    map_file.num_otus = len(headers)
    map_file.save()

    return OK, pid
def __load_taxonomy(self):
    """
    Builds ``self.taxonomy_map`` from the header row (first row) of the raw
    table's labels file.

    :return: None
    """
    label_rows = DataIO.tsv_to_table(
        self.user_id, self.pid, RAW_GENE_TABLE_LABELS_FILENAME)
    self.taxonomy_map = self.__get_taxonomy_mapping_from_dict(label_rows[0])