Esempio n. 1
0
    def tearDownClass(cls):
        """Delete the Breed, Counter and Dataset documents, then run the
        parent class teardown."""
        # wipe collections one after the other: breeds, counters, datasets
        for model in (Breed, Counter, Dataset):
            model.objects().delete()

        super().tearDownClass()
Esempio n. 2
0
    def setUpClass(cls):
        """Store the documents shared by the tests: one dataset, one breed
        (with an alias bound to that dataset) and the sheep/goat counters."""
        # the parent class takes care of initializing the mongomock instance
        super().setUpClass()

        # certain tests need a dataset to refer to
        archive_contents = [
            "plinktest.map",
            "plinktest.ped",
            "plinktest.fam",
            "plinktest.bim",
            "plinktest.bed",
            "snplist.txt",
            "finalreport.txt"
        ]
        test_dataset = Dataset(
            file="test.zip",
            country="Italy",
            species="Sheep",
            contents=archive_contents
        )
        test_dataset.save()

        # a breed (and its alias) is required in order to get a smarter id
        texel = Breed(
            species="Sheep",
            name="Texel",
            code="TEX",
            n_individuals=0,
            aliases=[
                BreedAlias(
                    fid="TEX_IT",
                    dataset=test_dataset,
                    country="Italy"
                )
            ]
        )
        texel.save()

        # a counter object is needed for both sheep and goat, starting from 0
        for counter_id in ("sampleSheep", "sampleGoat"):
            Counter(pk=counter_id, sequence_value=0).save()
    def test_import_breeds_force_country(self, my_working_dir):
        # Run the import command on a workbook saved in a temporary working
        # directory, then check the single breed stored and its alias.
        # NOTE(review): my_working_dir looks like a mock injected by a
        # decorator which is not visible here - confirm
        cli_args = [
            "--species", "sheep",
            "--dataset", "test.zip",
            "--datafile", "breed.xlsx",
            "--code_column", "Code",
            "--breed_column", "Name",
            "--country_column", "Country",
        ]

        # the temporary directory lives only inside this context manager
        with tempfile.TemporaryDirectory() as tmpdirname:
            working_dir = pathlib.Path(tmpdirname)
            my_working_dir.return_value = working_dir

            # the worksheet has to be in the temporary folder before the run
            self.workbook.save(f"{working_dir}/breed.xlsx")

            result = self.runner.invoke(self.main_function, cli_args)
            self.assertEqual(0, result.exit_code)

            # exactly one breed is expected after the import
            breeds = Breed.objects()
            self.assertEqual(breeds.count(), 1)

            stored_breed = breeds.get()
            expected_alias = BreedAlias(
                fid="TEX",
                dataset=self.dataset,
                country="Italy"
            )
            self.assertEqual(stored_breed.aliases, [expected_alias])
Esempio n. 4
0
    def test_sample_relies_dataset(self):
        """Getting two samples with the same original id is not a problem

        The same line is processed against two different datasets: two
        samples sharing the same ``original_id`` are expected. Documents
        created here are removed even when an assertion fails, in order to
        not affect the other tests.
        """

        # get a sample line
        line = self.lines[0]

        # get the breed whose alias matches the line FID in the test dataset
        breed = Breed.objects(aliases__match={
            'fid': line[0],
            'dataset': self.dataset
        }).get()

        # create a copy of dataset (the id is reset before saving it)
        new_dataset = deepcopy(self.dataset)

        new_dataset.file = "test2.zip"
        new_dataset.id = None
        new_dataset.save()

        # track what is created, in order to clean up in any case
        samples = []

        try:
            # create samplesheep objects with the same original_id, one for
            # each dataset
            for dataset in (self.dataset, new_dataset):
                samples.append(
                    self.plinkio.get_or_create_sample(line, dataset, breed))

            first, second = samples

            self.assertEqual(SampleSheep.objects.count(), 2)
            self.assertEqual(first.original_id, second.original_id)

        finally:
            # samples need to be deleted (last created first) in order to
            # remove the new dataset (mongoengine.DENY behaviour for
            # deleting samples)
            for sample in reversed(samples):
                sample.delete()

            # reset database to original state
            new_dataset.delete()
    def test_add_breed(self):
        # add a breed with two aliases through the command line and check
        # what is found in database
        cli_args = [
            "--species", "sheep",
            "--name", "Texel",
            "--code", "TEX",
            "--dataset", "test.zip",
            "--alias", "TEXEL_IT",
            "--alias", "0",
        ]
        outcome = self.runner.invoke(self.main_function, cli_args)

        self.assertEqual(0, outcome.exit_code)

        # only one breed is expected
        breeds = Breed.objects()
        self.assertEqual(breeds.count(), 1)

        stored_breed = breeds.get()

        # one alias for each --alias option, bound to the test dataset
        expected_aliases = []
        for fid in ("TEXEL_IT", "0"):
            expected_aliases.append(
                BreedAlias(fid=fid, dataset=self.dataset))

        self.assertEqual(stored_breed.aliases, expected_aliases)
Esempio n. 6
0
    def test_get_or_create_sample(self):
        # Calling get_or_create_sample twice on the same line must create
        # only one sample and count only one individual for the breed.

        # get a sample line
        line = self.lines[0]

        # get the breed whose alias matches the line FID in the test dataset
        breed = Breed.objects(aliases__match={
            'fid': line[0],
            'dataset': self.dataset
        }).get()

        def assert_single_sample():
            # one element in database
            self.assertEqual(SampleSheep.objects.count(), 1)

            # individuals are read again from database
            breed.reload()
            self.assertEqual(breed.n_individuals, 1)

        # no individuals for such breed, no samples at all
        self.assertEqual(breed.n_individuals, 0)
        self.assertEqual(SampleSheep.objects.count(), 0)

        # first call: collect the sample
        reference = self.plinkio.get_or_create_sample(
            line, self.dataset, breed)
        self.assertIsInstance(reference, SampleSheep)
        assert_single_sample()

        # second call with the same arguments: nothing new in database
        test = self.plinkio.get_or_create_sample(line, self.dataset, breed)
        self.assertIsInstance(test, SampleSheep)
        assert_single_sample()

        # the two calls return the same individual
        self.assertEqual(reference, test)
Esempio n. 7
0
def main(
        src_dataset, dst_dataset, datafile, code_column, country_column,
        id_column, sex_column, chip_name):
    """Get or create a sample for each row of a metadata file.

    Args:
        src_dataset: archive of the dataset which provides ``datafile``
        dst_dataset: archive of the dataset used to define samples
        datafile: the file to read, which has to be in ``src_dataset``
        code_column: column with the breed code (searched as alias ``fid``
            within ``dst_dataset``)
        country_column: column with the country (processed by
            ``find_country``)
        id_column: column with the sample original id
        sex_column: column with the sex; ignored when empty or ``None``
        chip_name: forwarded as it is to ``get_or_create_sample``

    NOTE(review): no error handling is done here, so a failure in fetching
    the datasets or in getting the breed presumably propagates to the
    caller - confirm.
    """
    script_name = Path(__file__).name
    logger.info(f"{script_name} started")

    # custom method to check a dataset and ensure that needed stuff exists
    src_dataset, [datapath] = fetch_and_check_dataset(
        archive=src_dataset,
        contents=[datafile]
    )

    # samples are defined relying on this dataset (no contents required)
    dst_dataset, _ = fetch_and_check_dataset(
        archive=dst_dataset,
        contents=[]
    )

    # the sample class depends on dataset species
    SampleSpecie = get_sample_species(dst_dataset.species)

    # read datafile
    data = pandas_open(datapath)

    logger.info(f"Got columns: {data.columns.to_list()}")

    for _, row in data.iterrows():
        logger.debug(f"Got: {row.to_list()}")
        code = row.get(code_column)
        country = row.get(country_column)
        original_id = row.get(id_column)

        # sex is considered only if a column is provided
        sex = None

        if sex_column:
            parsed_sex = SEX.from_string(str(row.get(sex_column)))

            # an unknown sex is treated as a missing value
            sex = None if parsed_sex == SEX.UNKNOWN else parsed_sex

        logger.debug(
            f"Got code: {code}, country: {country}, "
            f"original_id: {original_id}, sex: {sex}"
        )

        # process a country by doing a fuzzy search
        # HINT: this function caches results relying on arguments using
        # lru_cache: see find_country implementation for more information
        country_found = find_country(country)

        # get breed from database through its alias in destination dataset
        alias_filter = {'fid': code, 'dataset': dst_dataset}
        breed = Breed.objects(aliases__match=alias_filter).get()

        logger.debug(f"found breed '{breed}'")

        # get or create a new Sample Obj
        sample, created = get_or_create_sample(
            SampleSpecie,
            original_id,
            dst_dataset,
            breed,
            country_found.name,
            chip_name,
            sex)

        if created:
            logger.info(f"Sample '{sample}' added to database")

    logger.info(f"{script_name} ended")