def add_metadata_by_sample(
        data: pd.DataFrame, dst_dataset: Dataset, columns: dict):
    """Add metadata relying on the sample original id (column).

    For each row of ``data``, read the sample ``original_id`` (using the
    column named in ``columns["id_column"]``), collect location and
    metadata values from the configured columns, then update every
    sample of ``dst_dataset`` matching that ``original_id``.

    Args:
        data (pd.DataFrame): the metadata table read from the datafile.
        dst_dataset (Dataset): the dataset whose samples are updated.
        columns (dict): maps logical names (e.g. ``id_column``) to the
            actual column names found in ``data``.
    """
    # mind dataset species: pick the proper sample model (sheep or goat)
    SampleSpecie = get_sample_species(dst_dataset.species)

    # row index is not needed here
    for _, row in data.iterrows():
        original_id = row.get(columns["id_column"])

        # get additional columns for original_id
        location = get_locations(row, columns, original_id)
        metadata = get_metadata(row, columns, original_id)

        # ok iterate over all samples of this dataset with this id
        # (the same original_id may occur more than once)
        for sample in SampleSpecie.objects.filter(
                dataset=dst_dataset, original_id=original_id):
            logger.info(f"Updating '{sample}'")

            # set location features
            sample.location = location

            # set metadata if necessary
            if metadata:
                sample.metadata = metadata

            # update sample
            sample.save()
def add_phenotype_by_breed(
        data: pd.DataFrame, dst_dataset: Dataset, columns: dict):
    """Add phenotype relying on breed name (column).

    For each row of ``data``, read the breed name (using the column
    named in ``columns["breed_column"]``), collect the phenotype values
    from the configured columns, then create or update the phenotype of
    every sample of ``dst_dataset`` belonging to that breed.

    Args:
        data (pd.DataFrame): the phenotype table read from the datafile.
        dst_dataset (Dataset): the dataset whose samples are updated.
        columns (dict): maps logical names (e.g. ``breed_column``) to
            the actual column names found in ``data``.
    """
    logger.debug(f"Received columns: {columns}")

    # mind dataset species: pick the proper sample model (sheep or goat)
    SampleSpecie = get_sample_species(dst_dataset.species)

    for index, row in data.iterrows():
        logger.debug(f"{index}, {row}")
        breed = row.get(columns["breed_column"])

        # get columns modelled in smarter database
        named_columns = get_named_columns(row, columns, breed)

        # get additional columns for breed
        additional_column = get_additional_column(row, columns, breed)

        # ok iterate over all samples of this dataset with this breed
        for sample in SampleSpecie.objects.filter(
                dataset=dst_dataset, breed=breed):
            create_or_update_phenotype(
                sample, named_columns, additional_column)
def test_get_sample_goat(self):
    """get_sample_species() should map "Goat" to the SampleGoat model."""
    model = get_sample_species(species="Goat")
    self.assertEqual(model, SampleGoat)
def test_get_sample_sheep(self):
    """get_sample_species() should map "Sheep" to the SampleSheep model."""
    model = get_sample_species(species="Sheep")
    self.assertEqual(model, SampleSheep)
def main( src_dataset, dst_dataset, datafile, code_column, country_column, id_column, sex_column, chip_name): logger.info(f"{Path(__file__).name} started") # custom method to check a dataset and ensure that needed stuff exists src_dataset, [datapath] = fetch_and_check_dataset( archive=src_dataset, contents=[datafile] ) # this will be the dataset used to define samples dst_dataset, _ = fetch_and_check_dataset( archive=dst_dataset, contents=[] ) # mind dataset species SampleSpecie = get_sample_species(dst_dataset.species) # read datafile data = pandas_open(datapath) logger.info(f"Got columns: {data.columns.to_list()}") for index, row in data.iterrows(): logger.debug(f"Got: {row.to_list()}") code = row.get(code_column) country = row.get(country_column) original_id = row.get(id_column) sex = None if sex_column: sex = str(row.get(sex_column)) sex = SEX.from_string(sex) # drop sex column if unknown if sex == SEX.UNKNOWN: sex = None logger.debug( f"Got code: {code}, country: {country}, " f"original_id: {original_id}, sex: {sex}" ) # process a country by doing a fuzzy search # HINT: this function cache results relying arguments using lru_cache # see find country implementation for more informations country = find_country(country) # get breed from database breed = Breed.objects( aliases__match={'fid': code, 'dataset': dst_dataset}).get() logger.debug(f"found breed '{breed}'") # get or create a new Sample Obj sample, created = get_or_create_sample( SampleSpecie, original_id, dst_dataset, breed, country.name, chip_name, sex) if created: logger.info(f"Sample '{sample}' added to database") logger.info(f"{Path(__file__).name} ended")