def test_open_xls(self):
    # create a temporary directory using the context manager
    with tempfile.TemporaryDirectory() as tmpdirname:
        working_dir = pathlib.Path(tmpdirname)
        datapath = working_dir / "breed.xls"

        # save worksheet in temporary folder
        self.workbook.save(f"{datapath}")

        data = pandas_open(datapath)
        self.assertIsInstance(data, pd.DataFrame)
def main(species, dataset, datafile, code_column, breed_column, fid_column,
         country_column):
    logger.info(f"{Path(__file__).name} started")

    # custom method to check a dataset and ensure that needed stuff exists
    dataset, [datapath] = fetch_and_check_dataset(
        archive=dataset, contents=[datafile])

    # read breed data
    data = pandas_open(datapath)

    for index, row in data.iterrows():
        code = row.get(code_column)
        name = row.get(breed_column)

        # by default, fid is equal to code
        if not fid_column:
            fid_column = code_column

        fid = row.get(fid_column)

        logger.debug(
            f"Got code: '{code}', breed_name: '{name}', fid: '{fid}'")

        # deal with multi-country datasets
        country = None

        if country_column:
            country = row.get(country_column)

        # we also need to define an alias in order to retrieve this breed
        # when dealing with the original file
        alias = BreedAlias(fid=fid, dataset=dataset, country=country)

        try:
            breed, modified = get_or_create_breed(
                species=species, name=name, code=code, aliases=[alias])

            if modified:
                logger.info(f"{breed} added to database")

        except NotUniqueError as e:
            logger.error(e)
            raise SmarterDBException(
                f"Got an error while inserting '{name}', '{code}'")

    logger.info(f"{Path(__file__).name} ended")
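# For context, get_or_create_breed is expected to return a (breed, modified)
# tuple and to raise mongoengine's NotUniqueError on constraint violations.
# Below is a minimal sketch of such a helper, assuming an illustrative
# mongoengine schema: field names and types here are assumptions, not the
# project's actual models.
from mongoengine import (
    Document, EmbeddedDocument, EmbeddedDocumentListField, StringField)


class BreedAlias(EmbeddedDocument):
    # hypothetical fields mirroring the call site above
    fid = StringField(required=True)
    dataset = StringField()
    country = StringField()


class Breed(Document):
    species = StringField(required=True)
    name = StringField(required=True)
    code = StringField(required=True, unique_with="species")
    aliases = EmbeddedDocumentListField(BreedAlias)


def get_or_create_breed(species, name, code, aliases=None):
    """Create the breed if missing, otherwise append any new aliases;
    return (breed, modified)."""
    aliases = aliases or []
    modified = False
    qs = Breed.objects(species=species, name=name, code=code)

    if qs.count() == 0:
        # saving a duplicated code for the same species raises NotUniqueError
        breed = Breed(
            species=species, name=name, code=code, aliases=aliases).save()
        modified = True
    else:
        breed = qs.get()
        for alias in aliases:
            if alias not in breed.aliases:
                breed.aliases.append(alias)
                modified = True
        if modified:
            breed.save()

    return breed, modified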
def main(src_dataset, dst_dataset, datafile, sheet_name, breed_column,
         id_column, purpose_column, chest_girth_column, height_column,
         length_column, additional_column, na_values):
    logger.info(f"{Path(__file__).name} started")

    if additional_column:
        logger.debug(f"Got {additional_column} as additional phenotype")

    # custom method to check a dataset and ensure that needed stuff exists
    src_dataset, [datapath] = fetch_and_check_dataset(
        archive=src_dataset, contents=[datafile])

    # this will be the dataset used to define samples
    dst_dataset, _ = fetch_and_check_dataset(
        archive=dst_dataset, contents=[])

    if sheet_name and sheet_name.isnumeric():
        sheet_name = int(sheet_name)

    # open data with pandas
    data = pandas_open(datapath, na_values=na_values, sheet_name=sheet_name)

    # collect columns in a dictionary
    columns = {
        'breed_column': breed_column,
        'id_column': id_column,
        'purpose_column': purpose_column,
        'chest_girth_column': chest_girth_column,
        'height_column': height_column,
        'length_column': length_column,
        'additional_column': additional_column,
    }

    if breed_column:
        add_phenotype_by_breed(data, dst_dataset, columns)

    elif id_column:
        add_phenotype_by_sample(data, dst_dataset, columns)

    logger.info(f"{Path(__file__).name} ended")
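# A hedged sketch of how the columns mapping above could be consumed: for
# each row, translate the configured column names into phenotype keys while
# skipping unset columns and NaN cells. _row_to_phenotype is a hypothetical
# helper, not the project's actual code.
import math


def _row_to_phenotype(row, columns):
    phenotype = {}
    for key, column in columns.items():
        # the breed/id columns identify the target, not a phenotype value
        if not column or key in ("breed_column", "id_column"):
            continue
        value = row.get(column)
        if value is None or (isinstance(value, float) and math.isnan(value)):
            continue
        # e.g. 'purpose_column' -> 'purpose'
        phenotype[key.replace("_column", "")] = value
    return phenotype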
def main(src_dataset, dst_dataset, datafile, breed_column, id_column,
         latitude_column, longitude_column, metadata_column, na_values):
    logger.info(f"{Path(__file__).name} started")

    if metadata_column:
        logger.warning(f"Got {metadata_column} as additional metadata")

    # custom method to check a dataset and ensure that needed stuff exists
    src_dataset, [datapath] = fetch_and_check_dataset(
        archive=src_dataset,
        contents=[datafile]
    )

    # this will be the dataset used to define samples
    dst_dataset, _ = fetch_and_check_dataset(
        archive=dst_dataset,
        contents=[]
    )

    # open data with pandas
    data = pandas_open(datapath, na_values=na_values)

    # collect columns in a dictionary
    columns = {
        'breed_column': breed_column,
        'id_column': id_column,
        'latitude_column': latitude_column,
        'longitude_column': longitude_column,
        'metadata_column': metadata_column,
    }

    if breed_column:
        add_metadata_by_breed(data, dst_dataset, columns)

    elif id_column:
        add_metadata_by_sample(data, dst_dataset, columns)

    logger.info(f"{Path(__file__).name} ended")
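# By analogy with the phenotype script, the by-breed path presumably applies
# one row of metadata to every sample of that breed, while the by-sample path
# matches individual samples on their id. A hedged sketch of collecting GPS
# coordinates per breed (the helper and its names are illustrative only):
def collect_locations_by_breed(data, columns):
    locations = {}
    for _, row in data.iterrows():
        breed = row.get(columns["breed_column"])
        locations[breed] = {
            "latitude": row.get(columns["latitude_column"]),
            "longitude": row.get(columns["longitude_column"]),
        }
    return locations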
def test_open_csv(self):
    data = pandas_open(SCRIPTS_DATA_DIR / "test_manifest.csv")
    self.assertIsInstance(data, pd.DataFrame)
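# Both test_open_xls and test_open_csv exercise pandas_open on Excel and CSV
# inputs, and the import scripts pass na_values and sheet_name through to it.
# A minimal sketch consistent with that behaviour, dispatching on the file
# suffix (the real helper may differ):
import pathlib

import pandas as pd


def pandas_open(datapath, **kwargs):
    suffix = pathlib.Path(datapath).suffix.lower()

    if suffix in (".xls", ".xlsx"):
        return pd.read_excel(datapath, **kwargs)

    if suffix == ".csv":
        # sheet_name has no meaning for CSV input
        kwargs.pop("sheet_name", None)
        return pd.read_csv(datapath, **kwargs)

    raise ValueError(f"Unsupported file type: '{suffix}'")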
def main(
        src_dataset, dst_dataset, datafile, code_column, country_column,
        id_column, sex_column, chip_name):
    logger.info(f"{Path(__file__).name} started")

    # custom method to check a dataset and ensure that needed stuff exists
    src_dataset, [datapath] = fetch_and_check_dataset(
        archive=src_dataset,
        contents=[datafile]
    )

    # this will be the dataset used to define samples
    dst_dataset, _ = fetch_and_check_dataset(
        archive=dst_dataset,
        contents=[]
    )

    # mind dataset species
    SampleSpecie = get_sample_species(dst_dataset.species)

    # read datafile
    data = pandas_open(datapath)

    logger.info(f"Got columns: {data.columns.to_list()}")

    for index, row in data.iterrows():
        logger.debug(f"Got: {row.to_list()}")

        code = row.get(code_column)
        country = row.get(country_column)
        original_id = row.get(id_column)

        sex = None

        if sex_column:
            sex = str(row.get(sex_column))
            sex = SEX.from_string(sex)

            # drop the sex value if unknown
            if sex == SEX.UNKNOWN:
                sex = None

        logger.debug(
            f"Got code: {code}, country: {country}, "
            f"original_id: {original_id}, sex: {sex}"
        )

        # process a country by doing a fuzzy search
        # HINT: this function caches results on its arguments using
        # lru_cache; see the find_country implementation for more information
        country = find_country(country)

        # get breed from database
        breed = Breed.objects(
            aliases__match={'fid': code, 'dataset': dst_dataset}).get()

        logger.debug(f"found breed '{breed}'")

        # get or create a new Sample object
        sample, created = get_or_create_sample(
            SampleSpecie, original_id, dst_dataset, breed, country.name,
            chip_name, sex)

        if created:
            logger.info(f"Sample '{sample}' added to database")

    logger.info(f"{Path(__file__).name} ended")
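# find_country is described above as a fuzzy, lru_cache-memoized lookup that
# returns an object exposing a .name attribute. A minimal sketch of such a
# function, assuming pycountry (an assumption: the project may resolve
# countries differently):
from functools import lru_cache

import pycountry


@lru_cache(maxsize=None)
def find_country(name):
    # results are cached per argument, so repeated country names in the
    # datafile trigger a single fuzzy search; raises LookupError when no
    # country matches
    return pycountry.countries.search_fuzzy(name)[0]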