Esempio n. 1
0
    def __process_base(
            self,
            user_id,
            pid,
            base_arr,
            output_raw_otu_file_name=RAW_GENE_TABLE_FILENAME,
            output_raw_otu_labels_file_name=RAW_GENE_TABLE_LABELS_FILENAME):
        '''
        Transposes a parsed base table into a numeric matrix and extracts its labels.

        The first row of ``base_arr`` supplies the sample labels and the first
        column supplies the header (row) labels; the top-left corner cell belongs
        to neither. Every remaining cell is parsed as a number, with empty strings
        treated as zero. The output matrix holds one row per input column.

        The matrix and a two-row labels table ``[headers, sample_labels]`` are
        written out through ``DataIO.table_to_tsv``.

        :param user_id: user that owns the project
        :param pid: project identifier
        :param base_arr: table as a list of rows, including label row and column
        :param output_raw_otu_file_name: file name for the numeric matrix
        :param output_raw_otu_labels_file_name: file name for the labels table
        :return: tuple ``(base, headers, sample_labels, matrix_type)`` where
            ``matrix_type`` is "float" if any value is non-integral, else "int"
        :raises ValueError: if a non-empty cell cannot be parsed by ``float``
        :raises IndexError: if the table is empty or a row is shorter than the
            header row
        '''
        if not base_arr or not base_arr[0]:
            # Raised explicitly so malformed input fails with the same exception
            # type as a ragged row, before anything is written to disk.
            raise IndexError("OTU table is empty or is missing its header row")

        project_dir = os.path.join(ProjectManager.DATA_DIRECTORY, user_id, pid)
        raw_table_path = os.path.join(project_dir, output_raw_otu_file_name)

        # Row 0 and column 0 hold labels, so numeric data starts at offset 1.
        col_offset = 1
        row_offset = 1

        base = []
        matrix_type = "int"

        # Walk column-by-column so that each input column becomes an output row.
        for col in range(col_offset, len(base_arr[0])):
            new_row = []
            for row in range(row_offset, len(base_arr)):
                cell = base_arr[row][col]
                if cell == "":
                    # Empty values will default to zero
                    new_row.append(0)
                    continue
                val = float(cell)
                if val.is_integer():
                    new_row.append(int(val))
                else:
                    new_row.append(val)
                    # A single fractional value makes the whole matrix "float"
                    matrix_type = "float"
            base.append(new_row)

        # Skip the label row so the headers line up one-to-one with the columns
        # of `base`; the corner cell is not a header.
        headers = [row[0] for row in base_arr[row_offset:]]
        sample_labels = base_arr[0][col_offset:]

        labels = [headers, sample_labels]

        # NOTE(review): the matrix is written using a full path while the labels
        # are written using a bare file name - confirm DataIO.table_to_tsv
        # accepts both forms.
        DataIO.table_to_tsv(base, user_id, pid, raw_table_path)
        DataIO.table_to_tsv(labels, user_id, pid,
                            output_raw_otu_labels_file_name)

        return base, headers, sample_labels, matrix_type
    def get_otu_table_headers_at_taxonomic_level(user_id, pid, level, use_raw=False):
        """
        Returns the table headers, optionally collapsed to a taxonomic level.

        :param user_id: user that owns the project
        :param pid: project identifier
        :param level: value convertible to ``int``; -1 returns the stored headers
            unchanged, any other value is used as an index into each header's
            taxonomy entry
        :param use_raw: accepted for compatibility; not used by this function
        :return: the stored headers when ``level`` is -1, otherwise the distinct
            non-empty taxonomy names at ``level`` in first-seen order
        """
        logger.info("Using raw data")
        labels = DataIO.tsv_to_table(user_id, pid, RAW_GENE_TABLE_LABELS_FILENAME)
        headers = labels[0]

        level_index = int(level)
        if level_index == -1:
            # OTUs requested
            return headers

        taxonomy_map = Taxonomy(user_id, pid).get_taxonomy_map()

        taxonomies = []
        seen = set()
        for otu in headers:
            if otu not in taxonomy_map:
                # Headers without a taxonomy entry contribute nothing
                continue
            # Only the name at the requested level is used, not the full lineage
            name = taxonomy_map[otu][level_index]
            if name != "" and name not in seen:
                seen.add(name)
                taxonomies.append(name)
        return taxonomies
Esempio n. 3
0
 def __load_metadata_samples(self):
     """
     Reads the sample metadata file for this user/project through
     DataIO.tsv_to_table and stores the result on ``self.metadata``.
     :return: None
     """
     metadata_table = DataIO.tsv_to_table(
         self.user_id, self.pid, SAMPLE_METADATA_FILENAME)
     self.metadata = metadata_table
    def load_otu_table(self, user_id, pid, use_raw, use_np):
        """
        Loads the metadata, taxonomy, table and labels of a project onto self.

        :param user_id: user that owns the project
        :param pid: project identifier
        :param use_raw: accepted but not consulted by this method
        :param use_np: when truthy the table is read with
            DataIO.tsv_to_np_table, otherwise with DataIO.tsv_to_table
        :return: None
        """
        self.user_id = user_id
        self.pid = pid

        logger.info("Before load")
        self.sample_metadata = Metadata(user_id, pid)
        logger.info("Finished metadata loading")
        self.otu_metadata = Taxonomy(user_id, pid)
        logger.info("Finished taxonomy loading")

        logger.info("Using raw data")
        # Pick the reader once; both take the same (user, project, file) arguments
        read_table = DataIO.tsv_to_np_table if use_np else DataIO.tsv_to_table
        self.table = read_table(self.user_id, self.pid, RAW_GENE_TABLE_FILENAME)

        # The labels file stores the headers in row 0 and sample labels in row 1
        label_rows = DataIO.tsv_to_table(
            self.user_id, self.pid, RAW_GENE_TABLE_LABELS_FILENAME)
        self.headers = label_rows[0]
        self.sample_labels = label_rows[1]
    def test__create_project_from_tsv(self):
        """
        Stages and creates a project from the "small_biom" TSV fixtures, then
        checks the dimensions and per-row totals of the subsampled output.

        The created project directory is removed even when an assertion fails,
        so a failing run does not leave data behind for later runs.
        """
        project_manager = ProjectManager("unit_tests")

        unit_tests_dir = os.path.join(TestProjectManager.UNIT_TESTS_DIRECTORY,
                                      "small_biom")
        test_staging_dir = os.path.join(TestProjectManager.STAGING_DIRECTORY,
                                        "unit_tests")
        os.makedirs(test_staging_dir, exist_ok=True)

        # Copy the fixtures into the staging area the project manager reads from
        for fixture in ("table.raw.tsv", "taxonomy.tsv", "sample_metadata.tsv"):
            shutil.copyfile(os.path.join(unit_tests_dir, fixture),
                            os.path.join(test_staging_dir, fixture))

        _, pid = project_manager.stage_project_from_tsv(
            "tmp_project", "table.raw.tsv", "taxonomy.tsv",
            "sample_metadata.tsv", "")
        project_manager.create_project(pid, "", "", [])

        test_project_dir = os.path.join(
            TestProjectManager.UNIT_TESTS_DIRECTORY, pid)
        try:
            self.assertTrue(os.path.exists(test_project_dir))

            subsampled_table = DataIO.tsv_to_np_table("unit_tests", pid,
                                                      "table.subsampled.tsv")
            self.assertEqual(6, len(subsampled_table))

            # Every row of the subsampled table is expected to total 30
            for table_row in subsampled_table:
                self.assertEqual(30, np.sum(table_row))

            subsampled_table_labels = DataIO.tsv_to_table(
                "unit_tests", pid, "table.subsampled.labels.tsv")
            self.assertEqual(2, len(subsampled_table_labels))
            self.assertEqual(5, len(subsampled_table_labels[0]))
            self.assertEqual(6, len(subsampled_table_labels[1]))
        finally:
            # Clean up regardless of the outcome of the assertions above
            if os.path.exists(test_project_dir):
                shutil.rmtree(test_project_dir)
Esempio n. 6
0
 def get_file_for_download(self, project_name, type):
     """
     Builds the table a user downloads for one of a project's files.

     :param project_name: passed to DataIO as the project identifier
     :param type: "sample_metadata" returns the metadata table as stored;
         "otu" returns the numeric table with a header row and a leading
         "Sample Labels" column re-attached; anything else yields []
     :return: list of rows
     """
     if type == "sample_metadata":
         return DataIO.tsv_to_table(self.user_id, project_name,
                                    SAMPLE_METADATA_FILENAME)
     if type != "otu":
         return []

     table = DataIO.tsv_to_table(self.user_id, project_name,
                                 RAW_GENE_TABLE_FILENAME)
     labels = DataIO.tsv_to_table(self.user_id, project_name,
                                  RAW_GENE_TABLE_LABELS_FILENAME)

     full_table = [["Sample Labels", *labels[0]]]
     for index, values in enumerate(table):
         # Rows beyond the stored sample labels get an empty label
         row_label = labels[1][index] if index < len(labels[1]) else ""
         full_table.append([row_label, *values])
     return full_table
Esempio n. 7
0
    def stage_project_from_tsv(self, project_name, otu_filename,
                               sample_metadata_filename):
        """
        Creates a new project from files previously uploaded to the user's
        staging directory.

        A fresh project directory is created under a new UUID, the sample
        metadata file is moved into it under its standard name, the OTU table
        is parsed and processed, and a map file describing the project is saved.

        :param project_name: display name recorded in the map file
        :param otu_filename: name of the OTU table inside the staging directory
        :param sample_metadata_filename: name of the sample metadata file inside
            the staging directory
        :return: ``(OK, pid)`` on success; ``(OTU_DATATYPE_ERROR, "")`` if
            processing the OTU table raises ValueError; ``(OTU_ERROR, "")`` if
            it fails with any other exception
        :raises Exception: if the project directory already exists
        """
        # Creates a directory for this project
        pid = str(uuid.uuid4())
        project_dir = os.path.join(ProjectManager.DATA_DIRECTORY, self.user_id,
                                   pid)
        try:
            # Create-and-catch avoids a window between an existence check and
            # the creation itself
            os.makedirs(project_dir)
        except FileExistsError:
            logger.exception("Cannot create project folder")
            raise Exception(
                "Cannot create project folder as it already exists")

        # Renames the uploaded files to a standard file schema and moves to the project directory
        user_staging_dir = os.path.join(ProjectManager.STAGING_DIRECTORY,
                                        self.user_id)
        os.rename(os.path.join(user_staging_dir, sample_metadata_filename),
                  os.path.join(project_dir, SAMPLE_METADATA_FILENAME))

        # The parsed metadata is not used further here; the read is kept so any
        # error raised while loading it still surfaces at staging time
        # (presumably this includes rejecting empty headers - confirm in DataIO).
        DataIO.tsv_to_table(self.user_id,
                            pid,
                            SAMPLE_METADATA_FILENAME,
                            accept_empty_headers=False)

        logger.info("Beginning to load the OTU table")
        base_arr = DataIO.tsv_to_table_from_path(
            os.path.join(user_staging_dir, otu_filename),
            accept_empty_headers=False)

        # Processes the uploaded OTU file by extracting the headers and sample labels
        try:
            logger.info("Beginning to process the OTU table")
            base, headers, sample_labels, matrix_type = self.__process_base(
                self.user_id, pid, base_arr)
        except ValueError:
            # Raised when a table cell cannot be parsed as a number
            logger.exception("OTU file contains non-integers")
            # Removes the project directory since the files in it are invalid
            shutil.rmtree(project_dir, ignore_errors=True)
            return OTU_DATATYPE_ERROR, ""
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt and SystemExit
            # are not reported as a malformed upload
            logger.exception("Invalid OTU file format")
            # Removes the project directory since the files in it are invalid
            shutil.rmtree(project_dir, ignore_errors=True)
            return OTU_ERROR, ""

        # Creates map.txt file
        logger.info("Creating the map.txt file")
        map_file = Map(self.user_id, pid)
        map_file.project_name = project_name
        map_file.orig_otu_table_name = otu_filename
        map_file.orig_sample_metadata_name = sample_metadata_filename
        map_file.matrix_type = matrix_type
        map_file.num_samples = len(sample_labels)
        map_file.num_otus = len(headers)
        map_file.save()

        return OK, pid
Esempio n. 8
0
 def __load_taxonomy(self):
     """
     Reads the stored table labels for this user/project and builds
     ``self.taxonomy_map`` from their first row (the table headers).
     :return: None
     """
     label_rows = DataIO.tsv_to_table(
         self.user_id, self.pid, RAW_GENE_TABLE_LABELS_FILENAME)
     table_headers = label_rows[0]
     self.taxonomy_map = self.__get_taxonomy_mapping_from_dict(table_headers)