Example no. 1
    def starfile2df(cls, filepath, data_folder=None, max_rows=None):
        if data_folder is not None:
            if not os.path.isabs(data_folder):
                data_folder = os.path.join(os.path.dirname(filepath), data_folder)
        else:
            data_folder = os.path.dirname(filepath)

        # Note: Valid Relion image "_data.star" files must have their data in the first loop of the first block.
        # We are getting the first (and only) block in this StarFile object
        df = StarFile(filepath).get_block_by_index(0)
        column_types = {name: cls.metadata_fields.get(name, str) for name in df.columns}
        df = df.astype(column_types)

        df[["__mrc_index", "__mrc_filename"]] = df["_rlnImageName"].str.split(
            "@", 1, expand=True
        )
        df["__mrc_index"] = pd.to_numeric(df["__mrc_index"])

        # Adding a full-filepath field to the Dataframe helps us save time later
        # Note that os.path.join works as expected when the second argument is an absolute path itself
        df["__mrc_filepath"] = df["__mrc_filename"].apply(
            lambda filename: os.path.join(data_folder, filename)
        )

        if max_rows is None:
            return df
        else:
            return df.iloc[:max_rows]
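
A minimal sketch of what the `_rlnImageName` split above produces; the image names and the data_folder path below are made-up illustrations, not values taken from the example:

import os

import pandas as pd

# Hypothetical Relion-style image names of the form "<index>@<mrcs filename>"
df = pd.DataFrame({"_rlnImageName": ["000001@stack_0.mrcs", "000002@stack_0.mrcs"]})

# n=1 splits on the first "@" only; expand=True yields two columns
df[["__mrc_index", "__mrc_filename"]] = df["_rlnImageName"].str.split("@", n=1, expand=True)
df["__mrc_index"] = pd.to_numeric(df["__mrc_index"])  # "000001" -> 1

data_folder = "/data/micrographs"  # assumed location of the .mrcs files
df["__mrc_filepath"] = df["__mrc_filename"].apply(lambda f: os.path.join(data_folder, f))

print(df[["__mrc_index", "__mrc_filepath"]])
# __mrc_index 1 and 2, both pointing at /data/micrographs/stack_0.mrcs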
Example no. 2
    def testArgsError(self):
        with self.assertRaises(StarFileError):
            _blocks = OrderedDict()
            _blocks[""] = DataFrame(["test", "data"])
            with importlib_resources.path(tests.saved_test_data,
                                          "sample_data_model.star") as path:
                StarFile(filepath=path, blocks=_blocks)
    def testSave(self):
        # Save the StarFile object to disk,
        #   read it back, and check for equality.
        # Note that __eq__ is supported for StarFile/StarFileBlock classes

        with open("sample_saved.star", "w") as f:
            self.starfile.save(f)
        self.starfile2 = StarFile("sample_saved.star")
        self.assertEqual(self.starfile, self.starfile2)

        os.remove("sample_saved.star")
Example no. 4
    def testReadWriteReadBack(self):
        # Save the StarFile object to a .star file, read it back,
        # and check for object equality.
        # Note that __eq__ is supported for the class: it compares the
        # underlying OrderedDicts of DataFrames using pd.DataFrame.equals().
        test_outfile = os.path.join(self.tmpdir, "sample_saved.star")
        self.starfile.write(test_outfile)
        starfile2 = StarFile(test_outfile)
        self.assertEqual(self.starfile, starfile2)

        os.remove(test_outfile)
Example no. 5
    def setUp(self):
        with importlib_resources.path(tests.saved_test_data,
                                      "sample_data_model.star") as path:
            self.starfile = StarFile(path)

        # Independent Image object for testing Image source methods
        L = 768
        self.im = Image(misc.face(gray=True).astype("float64")[:L, :L])
        self.img_src = ArrayImageSource(self.im)

        # We also want to flex the stack logic.
        self.n = 21
        im_stack = np.broadcast_to(self.im.data, (self.n, L, L))
        # make each image methodically different
        im_stack = np.multiply(im_stack, np.arange(self.n)[:, None, None])
        self.im_stack = Image(im_stack)
        self.img_src_stack = ArrayImageSource(self.im_stack)

        # Create a tmpdir object for this test instance
        self._tmpdir = tempfile.TemporaryDirectory()
        # Get the directory from the name attribute of the instance
        self.tmpdir = self._tmpdir.name
Example no. 6
    def write_star(self, df1, df2, ang, cs, voltage, pixel_size, amp, name,
                   output_dir):
        """
        Write CTF parameters to a STAR file.
        """

        if not os.path.isdir(output_dir):
            os.mkdir(output_dir)
        data_block = {}
        data_block["_rlnMicrographName"] = name
        data_block["_rlnDefocusU"] = df1
        data_block["_rlnDefocusV"] = df2
        data_block["_rlnDefocusAngle"] = ang
        data_block["_rlnSphericalAbberation"] = cs
        data_block["_rlnAmplitudeContrast"] = amp
        data_block["_rlnVoltage"] = voltage
        data_block["_rlnDetectorPixelSize"] = pixel_size
        df = DataFrame([data_block])
        blocks = OrderedDict()
        blocks["root"] = df
        star = StarFile(blocks=blocks)
        star.write(
            os.path.join(output_dir, os.path.splitext(name)[0]) + ".star")
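
A hedged usage sketch of `write_star`; `estimator` stands in for whatever object defines the method, and every value below is an illustrative placeholder rather than a real CTF estimate:

# Hypothetical instance providing write_star(); all values are placeholders.
estimator.write_star(
    df1=15000.0,             # defocus U, Angstrom
    df2=14500.0,             # defocus V, Angstrom
    ang=45.0,                # defocus angle, degrees
    cs=2.7,                  # spherical aberration, mm
    voltage=300.0,           # acceleration voltage, kV
    pixel_size=1.34,         # detector pixel size
    amp=0.07,                # amplitude contrast
    name="micrograph_001.mrc",
    output_dir="ctf_output",
)
# Writes ctf_output/micrograph_001.star containing a single-row "root" block
# with the _rln* fields populated from the arguments.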
Example no. 7
    def testWriteReadWriteBack(self):
        # setup our temp filenames
        test_outfile = os.path.join(self.tmpdir, "sample_saved.star")
        test_outfile2 = os.path.join(self.tmpdir, "sample_saved2.star")

        # create a new StarFile object directly from an OrderedDict of blocks
        # (plain dicts and DataFrames), not by reading a file
        data = OrderedDict()
        # note that GEMMI requires the names of the fields to start with _
        # initialize a key-value set (a set of pairs in GEMMI parlance)
        block0 = {
            "_key1": "val1",
            "_key2": "val2",
            "_key3": "val3",
            "_key4": "val4"
        }
        # initialize a single-row loop. we want this to be distinct from a
        # set of key-value pairs
        block1_dict = {"_field1": 31, "_field2": 32, "_field3": 33}
        block1 = DataFrame([block1_dict], columns=block1_dict.keys())
        block2_keys = ["_field4", "_field5", "_field6"]
        block2_arr = [[f"{x}{y}" for x in range(3)] for y in range(3)]
        # initialize a loop data block with a list of lists
        block2 = DataFrame(block2_arr, columns=block2_keys)
        data["pair"] = block0
        data["single_row"] = block1
        data["loops"] = block2
        # initialize with blocks kwarg
        original = StarFile(blocks=data)
        original.write(test_outfile)
        read_back = StarFile(test_outfile)
        # assert that the read-back objects are equal
        self.assertEqual(original, read_back)
        # write back the second star file object
        read_back.write(test_outfile2)
        # compare the two .star files line by line
        with open(test_outfile) as f_original, open(
                test_outfile2) as f_read_back:
            lines_original = f_original.readlines()
            lines_read_back = f_read_back.readlines()
            self.assertEqual(lines_original, lines_read_back)

        os.remove(test_outfile)
        os.remove(test_outfile2)
Example no. 8
class StarFileTestCase(TestCase):
    def setUp(self):
        with importlib_resources.path(tests.saved_test_data,
                                      "sample_data_model.star") as path:
            self.starfile = StarFile(path)

        # Independent Image object for testing Image source methods
        L = 768
        self.im = Image(misc.face(gray=True).astype("float64")[:L, :L])
        self.img_src = ArrayImageSource(self.im)

        # We also want to flex the stack logic.
        self.n = 21
        im_stack = np.broadcast_to(self.im.data, (self.n, L, L))
        # make each image methodically different
        im_stack = np.multiply(im_stack, np.arange(self.n)[:, None, None])
        self.im_stack = Image(im_stack)
        self.img_src_stack = ArrayImageSource(self.im_stack)

        # Create a tmpdir object for this test instance
        self._tmpdir = tempfile.TemporaryDirectory()
        # Get the directory from the name attribute of the instance
        self.tmpdir = self._tmpdir.name

    def tearDown(self):
        # Destroy the tmpdir instance and contents
        self._tmpdir.cleanup()

    def testLength(self):
        # StarFile is an iterable that gives us blocks
        # blocks are pandas DataFrames
        # We have 6 blocks in our sample starfile.
        self.assertEqual(6, len(self.starfile))

    def testIteration(self):
        # A StarFile can be iterated over, yielding DataFrames for loops
        # or dictionaries for pairs
        for _, block in self.starfile:
            self.assertTrue(isinstance(block, (DataFrame, dict)))

    def testBlockByIndex(self):
        # We can use get_block_by_index to retrieve the blocks in
        # the OrderedDict by index
        # our first block is a set of pairs, represented by a dict
        block0 = self.starfile.get_block_by_index(0)
        self.assertTrue(isinstance(block0, dict))
        self.assertEqual(block0["_rlnReferenceDimensionality"], "3")
        # our second block is a loop, represented by a DataFrame
        block1 = self.starfile.get_block_by_index(1)
        self.assertTrue(isinstance(block1, DataFrame))
        self.assertEqual(block1.at[0, "_rlnClassDistribution"], "1.000000")

    def testBlockByName(self):
        # Indexing a StarFile with a string gives us a block with that name
        #   ("data_<name>" in starfile).
        # the block at index 0 has the name 'model_general'
        block0 = self.starfile["model_general"]
        # This block is a pair/dict with 22 key-value pairs
        self.assertEqual(len(block0), 22)
        # the block at index 1 has name 'model_classes'
        block1 = self.starfile["model_classes"]
        # This block is a loop/DF with one row
        self.assertEqual(len(block1), 1)

    def testData(self):
        df = self.starfile["model_class_1"]
        self.assertEqual(76, len(df))
        self.assertEqual(8, len(df.columns))
        # Note that no typecasting of values is performed at io.StarFile level
        self.assertEqual(
            "0.000000",
            df[df["_rlnSpectralIndex"] == "0"].iloc[0]["_rlnResolution"])

    def testFileNotFound(self):
        with self.assertRaises(FileNotFoundError):
            StarFile("badfile.star")

    def testReadWriteReadBack(self):
        # Save the StarFile object to a .star file, read it back,
        # and check for object equality.
        # Note that __eq__ is supported for the class: it compares the
        # underlying OrderedDicts of DataFrames using pd.DataFrame.equals().
        test_outfile = os.path.join(self.tmpdir, "sample_saved.star")
        self.starfile.write(test_outfile)
        starfile2 = StarFile(test_outfile)
        self.assertEqual(self.starfile, starfile2)

        os.remove(test_outfile)

    def testWriteReadWriteBack(self):
        # setup our temp filenames
        test_outfile = os.path.join(self.tmpdir, "sample_saved.star")
        test_outfile2 = os.path.join(self.tmpdir, "sample_saved2.star")

        # create a new StarFile object directly from an OrderedDict of blocks
        # (plain dicts and DataFrames), not by reading a file
        data = OrderedDict()
        # note that GEMMI requires the names of the fields to start with _
        # initialize a key-value set (a set of pairs in GEMMI parlance)
        block0 = {
            "_key1": "val1",
            "_key2": "val2",
            "_key3": "val3",
            "_key4": "val4"
        }
        # initialize a single-row loop. we want this to be distinct from a
        # set of key-value pairs
        block1_dict = {"_field1": 31, "_field2": 32, "_field3": 33}
        block1 = DataFrame([block1_dict], columns=block1_dict.keys())
        block2_keys = ["_field4", "_field5", "_field6"]
        block2_arr = [[f"{x}{y}" for x in range(3)] for y in range(3)]
        # initialize a loop data block with a list of lists
        block2 = DataFrame(block2_arr, columns=block2_keys)
        data["pair"] = block0
        data["single_row"] = block1
        data["loops"] = block2
        # initialize with blocks kwarg
        original = StarFile(blocks=data)
        original.write(test_outfile)
        read_back = StarFile(test_outfile)
        # assert that the read-back objects are equal
        self.assertEqual(original, read_back)
        # write back the second star file object
        read_back.write(test_outfile2)
        # compare the two .star files line by line
        with open(test_outfile) as f_original, open(
                test_outfile2) as f_read_back:
            lines_original = f_original.readlines()
            lines_read_back = f_read_back.readlines()
            self.assertEqual(lines_original, lines_read_back)

        os.remove(test_outfile)
        os.remove(test_outfile2)

    def testArgsError(self):
        with self.assertRaises(StarFileError):
            _blocks = OrderedDict()
            _blocks[""] = DataFrame(["test", "data"])
            with importlib_resources.path(tests.saved_test_data,
                                          "sample_data_model.star") as path:
                StarFile(filepath=path, blocks=_blocks)

    def testEmptyInit(self):
        empty = StarFile()
        self.assertTrue(isinstance(empty.blocks, OrderedDict))
        self.assertEqual(len(empty.blocks), 0)
Example no. 9
    def testEmptyInit(self):
        empty = StarFile()
        self.assertTrue(isinstance(empty.blocks, OrderedDict))
        self.assertEqual(len(empty.blocks), 0)
Example no. 10
    def testFileNotFound(self):
        with self.assertRaises(FileNotFoundError):
            StarFile("badfile.star")
Example no. 11
class StarFileTestCase(TestCase):
    def setUp(self):
        with importlib_resources.path(tests.saved_test_data,
                                      "sample.star") as path:
            self.starfile = StarFile(path)

        # Independent Image object for testing Image source methods
        L = 768
        self.im = Image(misc.face(gray=True).astype("float64")[:L, :L])
        self.img_src = ArrayImageSource(self.im)

        # We also want to flex the stack logic.
        self.n = 21
        im_stack = np.broadcast_to(self.im.data, (self.n, L, L))
        # make each image methodically different
        im_stack = np.multiply(im_stack, np.arange(self.n)[:, None, None])
        self.im_stack = Image(im_stack)
        self.img_src_stack = ArrayImageSource(self.im_stack)

        # Create a tmpdir object for this test instance
        self._tmpdir = tempfile.TemporaryDirectory()
        # Get the directory from the name attribute of the instance
        self.tmpdir = self._tmpdir.name

    def tearDown(self):
        # Destroy the tmpdir instance and contents
        self._tmpdir.cleanup()

    def testLength(self):
        # StarFile is an iterable that gives us blocks.
        #   We have 2 blocks in our sample starfile.
        self.assertEqual(2, len(self.starfile))

    def testIteration(self):
        # A StarFile can be iterated over, yielding StarFileBlocks
        for block in self.starfile:
            self.assertTrue(isinstance(block, StarFileBlock))

    def testBlockByIndex(self):
        # Indexing a StarFile with a 0-based index gives us a 'block',
        block0 = self.starfile[0]
        self.assertTrue(isinstance(block0, StarFileBlock))
        # Our first block has no 'loops'.
        self.assertEqual(0, len(block0))

    def testBlockByName(self):
        # Indexing a StarFile with a string gives us a block with that name
        #   ("data_<name>" in starfile).
        # In our case the block at index 1 has name 'planetary'
        block1 = self.starfile["planetary"]
        # This block has two 'loops'.
        self.assertEqual(2, len(block1))

    def testBlockProperties(self):
        # A StarFileBlock may have attributes that were read from the
        #   starfile key=>value pairs.
        block0 = self.starfile["general"]
        # Note that no typecasting is performed
        self.assertEqual(block0._three, "3")

    def testLoop(self):
        loop = self.starfile[1][0]
        self.assertIsInstance(loop, DataFrame)

    def testData1(self):
        df = self.starfile["planetary"][0]
        self.assertEqual(8, len(df))
        self.assertEqual(4, len(df.columns))
        # Note that no typecasting of values is performed at io.StarFile level
        self.assertEqual("1", df[df["_name"] == "Earth"].iloc[0]["_gravity"])

    def testData2(self):
        df = self.starfile["planetary"][1]
        self.assertEqual(3, len(df))
        self.assertEqual(2, len(df.columns))
        # Missing values in a loop default to ''
        self.assertEqual(
            "", df[df["_name"] == "Earth"].iloc[0]["_discovered_year"])

    def testSave(self):
        # Save the StarFile object to disk,
        #   read it back, and check for equality.
        # Note that __eq__ is supported for StarFile/StarFileBlock classes

        with open("sample_saved.star", "w") as f:
            self.starfile.save(f)
        self.starfile2 = StarFile("sample_saved.star")
        self.assertEqual(self.starfile, self.starfile2)

        os.remove("sample_saved.star")
Example no. 12
    def save_metadata(self,
                      starfile_filepath,
                      new_mrcs=True,
                      batch_size=512,
                      save_mode=None):
        """
        Save updated metadata to a STAR file

        :param starfile_filepath: Path to STAR file where we want to
            save image_source
        :param new_mrcs: Whether to save all images to new MRCS files.
            If True, new file names and paths need to be created.
        :param batch_size: Batch size of images to query from the
            `ImageSource` object. Every `batch_size` rows, entries are
            written to the STAR file.
        :param save_mode: Whether to save all images to a single MRCS file
            ("single") or to multiple files of `batch_size` images each.
        :return: A list of the MRCS filenames referenced by each row's
            `_rlnImageName` entry.
        """

        df = self._metadata.copy()
        # Drop any column that doesn't start with a *single* underscore
        df = df.drop(
            [
                str(col) for col in df.columns
                if not col.startswith("_") or col.startswith("__")
            ],
            axis=1,
        )

        with open(starfile_filepath, "w") as f:
            if new_mrcs:
                # Create a new column that we will populate in the loop below
                df["_rlnImageName"] = ""

                if save_mode == "single":
                    # Save all images into one single mrc file
                    fname = os.path.basename(starfile_filepath)
                    fstem = os.path.splitext(fname)[0]
                    mrcs_filename = f"{fstem}_{0}_{self.n-1}.mrcs"

                    # Then set name in dataframe for the StarFile
                    # Note, here the row_indexer is :, representing all rows in this data frame.
                    #   df.loc will be responsible for dereferencing and assigning values to df.
                    #   Pandas will assert df.shape[0] == self.n
                    df.loc[:, "_rlnImageName"] = [
                        f"{j + 1:06}@{mrcs_filename}" for j in range(self.n)
                    ]
                else:
                    # save all images into multiple mrc files in batch size
                    for i_start in np.arange(0, self.n, batch_size):
                        i_end = min(self.n, i_start + batch_size)
                        num = i_end - i_start
                        mrcs_filename = (
                            os.path.splitext(os.path.basename(starfile_filepath))[0]
                            + f"_{i_start}_{i_end-1}.mrcs"
                        )

                        # Note, here the row_indexer is a slice.
                        #   df.loc will be responsible for dereferencing and assigning values to df.
                        #   Pandas will assert the length of row_indexer equals num.
                        row_indexer = df[i_start:i_end].index
                        df.loc[row_indexer, "_rlnImageName"] = [
                            "{0:06}@{1}".format(j + 1, mrcs_filename)
                            for j in range(num)
                        ]

            filename_indices = df._rlnImageName.str.split(
                pat="@", expand=True)[1].tolist()

            # initialize the StarFile object and save it
            starfile = StarFile(blocks=[StarFileBlock(loops=[df])])
            starfile.save(f)

        return filename_indices
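
A worked illustration of the batch file naming used above; the image count and output path are made up for this sketch:

import os

import numpy as np

# Illustrative values only.
n = 1300
batch_size = 512
starfile_filepath = "out/particles.star"

fstem = os.path.splitext(os.path.basename(starfile_filepath))[0]  # "particles"
for i_start in np.arange(0, n, batch_size):
    i_end = min(n, i_start + batch_size)
    print(f"{fstem}_{i_start}_{i_end-1}.mrcs")
# particles_0_511.mrcs
# particles_512_1023.mrcs
# particles_1024_1299.mrcs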
Example no. 13
    def save_metadata(
        self, starfile_filepath, new_mrcs=True, batch_size=512, save_mode=None
    ):
        """
        Save updated metadata to a STAR file

        :param starfile_filepath: Path to STAR file where we want to
            save image_source
        :param new_mrcs: Whether to save all images to new MRCS files.
            If True, new file names and paths need to be created.
        :param batch_size: Batch size of images to query from the
            `ImageSource` object. Every `batch_size` rows, entries are
            written to the STAR file.
        :param save_mode: Whether to save all images to a single MRCS file
            ("single") or to multiple files of `batch_size` images each.
        :return: A list of the MRCS filenames referenced by each row's
            `_rlnImageName` entry.
        """

        df = self._metadata.copy()
        # Drop any column that doesn't start with a *single* underscore
        df = df.drop(
            [
                str(col)
                for col in df.columns
                if not col.startswith("_") or col.startswith("__")
            ],
            axis=1,
        )

        filename_indices = None

        with open(starfile_filepath, "w") as f:
            if new_mrcs:
                # Create a new column that we will populate in the loop below
                df["_rlnImageName"] = ""

                if save_mode == "single":
                    # Save all images into one single mrc file
                    fname = os.path.basename(starfile_filepath)
                    fstem = os.path.splitext(fname)[0]
                    mrcs_filename = f"{fstem}_{0}_{self.n-1}.mrcs"

                    # Then set the name in the dataframe for the StarFile.
                    # Assign via .loc with a plain list to avoid chained
                    # assignment and index-alignment pitfalls.
                    df.loc[:, "_rlnImageName"] = [
                        f"{j + 1:06}@{mrcs_filename}" for j in range(self.n)
                    ]

                else:
                    # save all images into multiple mrc files in batch size
                    for i_start in np.arange(0, self.n, batch_size):
                        i_end = min(self.n, i_start + batch_size)
                        num = i_end - i_start
                        mrcs_filename = (
                            os.path.splitext(os.path.basename(starfile_filepath))[0]
                            + f"_{i_start}_{i_end-1}.mrcs"
                        )

                        df["_rlnImageName"][i_start:i_end] = pd.Series(
                            [
                                "{0:06}@{1}".format(j + 1, mrcs_filename)
                                for j in range(num)
                            ]
                        )

            filename_indices = [
                df["_rlnImageName"][i].split("@")[1] for i in range(self.n)
            ]

            # initialize the StarFile object and save it
            starfile = StarFile(blocks=[StarFileBlock(loops=[df])])
            starfile.save(f)

        return filename_indices
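
A hedged usage sketch of `save_metadata`; `src` stands in for an ImageSource-like object that defines the method, and the paths are illustrative:

# Hypothetical ImageSource-like instance providing save_metadata().
mrcs_names = src.save_metadata(
    "out/particles.star",
    new_mrcs=True,
    batch_size=512,
    save_mode=None,  # None -> image names refer to one MRCS file per batch of 512 rows
)
# mrcs_names has one entry per image row, e.g.
# ["particles_0_511.mrcs", "particles_0_511.mrcs", ..., "particles_512_1023.mrcs", ...]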