@classmethod
def tearDownClass(cls):
    # delete created objects
    Breed.objects().delete()
    Counter.objects().delete()
    Dataset.objects().delete()

    super().tearDownClass()
def main(species, assembly):
    logger.info(f"{Path(__file__).name} started")

    # find assembly configuration
    if assembly not in WORKING_ASSEMBLIES:
        raise Exception(f"assembly {assembly} not managed by smarter")

    # open a file to track files to merge
    smarter_tag = "SMARTER-{specie}-{assembly}-top-{version}".format(
        specie=SPECIES2CODE[species.capitalize()],
        assembly=assembly.upper(),
        version=__version__)

    merge_file = get_interim_dir() / smarter_tag

    with merge_file.open(mode="w") as handle:
        for dataset in Dataset.objects(species=species.capitalize()):
            logger.debug(f"Got {dataset}")

            # search for result dir
            results_dir = Path(dataset.result_dir) / assembly.upper()

            if results_dir.exists():
                logger.info(f"Found {results_dir}")

                # search for bed files
                bed_files = results_dir.glob('*.bed')

                # I can have more than one file per dataset (if more than
                # one file is included in the dataset)
                for bed_file in bed_files:
                    # determine the bedfile full path
                    prefix = results_dir / bed_file.stem

                    logger.info(f"Appending {prefix} for merge")

                    # track file to merge
                    handle.write(f"{prefix}\n")

    # ok check for results dir
    final_dir = get_processed_dir() / assembly
    final_dir.mkdir(parents=True, exist_ok=True)

    # ok time to convert data in plink binary format
    cmd = ["plink"] + PLINK_SPECIES_OPT[dataset.species] + [
        "--merge-list",
        f"{merge_file}",
        "--make-bed",
        "--out",
        f"{final_dir / smarter_tag}"
    ]

    # debug
    logger.info("Executing: " + " ".join(cmd))

    subprocess.run(cmd, check=True)

    logger.info(f"{Path(__file__).name} ended")
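# NOTE: get_interim_dir() and get_processed_dir() are project helpers not
# shown here. A minimal sketch of what they might look like, assuming the
# data/interim and data/processed layout used elsewhere in this project
# (the exact module location of these helpers is an assumption):

from pathlib import Path


def get_project_dir() -> Path:
    # assumption: this module lives two levels below the project root
    return Path(__file__).resolve().parents[2]


def get_interim_dir() -> Path:
    # intermediate data that has been transformed but not finalized
    return get_project_dir() / "data" / "interim"


def get_processed_dir() -> Path:
    # final, canonical datasets ready for analysis or distribution
    return get_project_dir() / "data" / "processed"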
def fetch_and_check_dataset(
        archive: str, contents: list[str]) -> tuple[Dataset, list[Path]]:
    """Common operations on dataset: fetch a dataset by file (submitted
    archive), check that the working dir exists and that the required file
    contents are in the dataset. Test and get the full path of the required
    files

    Args:
        archive (str): the dataset archive (file)
        contents (list): a list of files which need to be defined in dataset

    Returns:
        Dataset: a dataset instance
        list[Path]: a list of Path of required files
    """

    # get the dataset object
    dataset = Dataset.objects(file=archive).get()

    logger.debug(f"Found {dataset}")

    # check for working directory
    working_dir = dataset.working_dir

    if not working_dir.exists():
        raise FileNotFoundError(
            f"Couldn't find dataset directory '{working_dir}'")

    # check files are in dataset
    not_found = []
    contents_path = []

    for item in contents:
        if item not in dataset.contents:
            logger.error(f"Couldn't find '{item}' in dataset: '{dataset}'")
            not_found.append(item)
            continue

        item_path = working_dir / item

        if not item_path.exists():
            logger.error(
                f"Couldn't find '{item_path}' in dir: '{working_dir}'")
            not_found.append(item)
            continue

        contents_path.append(item_path)

    if len(not_found) > 0:
        raise FileNotFoundError(f"Couldn't find '{not_found}'")

    return dataset, contents_path
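# Example usage of fetch_and_check_dataset. The archive and file names below
# are hypothetical placeholders, not real project data:

def example_fetch_usage():
    dataset, (mapfile, pedfile) = fetch_and_check_dataset(
        archive="example_dataset.zip",
        contents=["example.map", "example.ped"])

    # dataset is the matching Dataset document; mapfile and pedfile are
    # full Paths inside dataset.working_dir
    logger.debug(f"Got {dataset}: {mapfile}, {pedfile}")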
def test_update_pedfile(self):
    # get a dataset
    dataset = Dataset.objects(file="test.zip").get()

    # create a temporary directory using the context manager
    with tempfile.TemporaryDirectory() as tmpdirname:
        outfile = pathlib.Path(tmpdirname) / "plinktest_updated.ped"
        self.plinkio.update_pedfile(str(outfile), dataset, 'ab', fid="TEX")

        # now open the output file and test stuff
        test = TextPlinkIO(
            mapfile=str(DATA_DIR / "plinktest.map"),
            pedfile=str(outfile))

        # assert two records written
        self.assertEqual(len(list(test.read_pedfile())), 2)
def test_process_pedline(self):
    # define reference
    reference = [
        'TEX', 'ITOA-TEX-000000001', '0', '0', '0', -9,
        'A', 'A', 'G', 'G'
    ]

    # get a line for testing
    line = self.lines[0]

    # get a dataset
    dataset = Dataset.objects(file="test.zip").get()

    test = self.plinkio._process_pedline(line, dataset, 'ab')

    self.assertEqual(reference, test)
def setUp(self):
    super().setUp()

    self.plinkio = TextPlinkIO(
        prefix=str(DATA_DIR / "plinktest"),
        species="Sheep")

    # read info from map
    self.plinkio.read_mapfile()
    self.plinkio.fetch_coordinates(
        version="Oar_v3.1",
        imported_from="SNPchiMp v.3")

    # read the lines of the ped file
    self.lines = list(self.plinkio.read_pedfile())

    # get a dataset
    self.dataset = Dataset.objects(file="test.zip").get()
def main(species, name, code, alias, dataset):
    """Add or update a breed into SMARTER database"""

    logger.info(f"{Path(__file__).name} started")

    # get the dataset object
    dataset = Dataset.objects(file=dataset).get()

    # fix input parameters
    aliases = [BreedAlias(fid=fid, dataset=dataset) for fid in alias]
    species = species.capitalize()
    code = code.upper()

    # get a breed object relying on parameters
    breed, modified = get_or_create_breed(
        species=species, name=name, code=code, aliases=aliases)

    if modified:
        logger.info(f"{breed} added to database")

    logger.info(f"{Path(__file__).name} ended")
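# get_or_create_breed() is imported from elsewhere in the project and not
# shown here. A minimal sketch of what such a helper could look like,
# assuming Breed is a mongoengine Document with species, name, code and
# aliases fields (the field names and the update rules are assumptions):

def get_or_create_breed_sketch(species, name, code, aliases=None):
    aliases = aliases or []
    modified = False

    try:
        breed = Breed.objects(species=species, name=name, code=code).get()

        # track any alias not already associated with this breed
        for alias in aliases:
            if alias not in breed.aliases:
                breed.aliases.append(alias)
                modified = True

    except Breed.DoesNotExist:
        # no such breed yet: create a new document
        breed = Breed(species=species, name=name, code=code, aliases=aliases)
        modified = True

    if modified:
        breed.save()

    return breed, modified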
def main(file_, bfile, dataset, coding, chip_name, assembly):
    """Read sample names from map/ped files and update the smarter database
    (insert a record if necessary and define a smarter id for each sample)
    """

    logger.info(f"{Path(__file__).name} started")

    # find assembly configuration
    if assembly not in WORKING_ASSEMBLIES:
        raise Exception(f"assembly {assembly} not managed by smarter")

    assembly_conf = WORKING_ASSEMBLIES[assembly]

    # get the dataset object
    dataset = Dataset.objects(file=dataset).get()

    logger.debug(f"Found {dataset}")

    if file_:
        plinkio, output_dir, output_map, output_ped = deal_with_text_plink(
            file_, dataset, assembly)

    elif bfile:
        plinkio, output_dir, output_map, output_ped = deal_with_binary_plink(
            bfile, dataset, assembly)

    # check chip_name
    illumina_chip = IlluminaChip.objects(name=chip_name).get()

    # set chip name for this sample
    plinkio.chip_name = illumina_chip.name

    # test if I have already run this analysis
    # ok check for results dir
    results_dir = dataset.result_dir
    results_dir = results_dir / assembly
    results_dir.mkdir(parents=True, exist_ok=True)

    # define final filename
    final_prefix = results_dir / output_ped.stem

    # test for processed files existence
    if plink_binary_exists(final_prefix):
        logger.warning(
            f"Skipping {dataset} processing: {final_prefix} exists")
        logger.info(f"{Path(__file__).name} ended")
        return

    # if I arrive here, I can create output files
    # read mapdata and read updated coordinates from db
    plinkio.read_mapfile()

    # fetch coordinates relying on assembly configuration
    plinkio.fetch_coordinates(
        version=assembly_conf.version,
        imported_from=assembly_conf.imported_from)

    logger.info("Writing a new map file with updated coordinates")
    plinkio.update_mapfile(str(output_map))

    logger.info("Writing a new ped file with fixed genotype")
    plinkio.update_pedfile(output_ped, dataset, coding)

    # ok time to convert data in plink binary format
    cmd = ["plink"] + PLINK_SPECIES_OPT[dataset.species] + [
        "--file",
        f"{output_dir / output_ped.stem}",
        "--make-bed",
        "--out",
        f"{final_prefix}"
    ]

    # debug
    logger.info("Executing: " + " ".join(cmd))

    subprocess.run(cmd, check=True)

    logger.info(f"{Path(__file__).name} ended")
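# plink_binary_exists() is a project helper not shown here. A minimal sketch
# of such a check, assuming it simply tests for the three files of a PLINK
# binary fileset (.bed, .bim, .fam) sharing the given prefix:

from pathlib import Path


def plink_binary_exists_sketch(prefix: Path) -> bool:
    # a PLINK binary dataset is complete only if all three files are present
    return all(
        prefix.parent.joinpath(prefix.name + suffix).exists()
        for suffix in (".bed", ".bim", ".fam"))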
def main(input_filepath, output_filepath, types):
    """Runs data processing scripts to turn raw data from (../raw) into
    cleaned data ready to be analyzed (saved in ../processed).
    """

    logger = logging.getLogger(__name__)

    # connect to database
    global_connection()

    with open(input_filepath) as handle:
        reader = csv.reader(handle, delimiter=";")
        header = next(reader)

        # remove header id
        del header[0]

        # sanitize column names
        header = [sanitize(col) for col in header]

        logger.info("Got %s as header" % header)

        # define a datatype for my data
        Record = collections.namedtuple("Record", header)

        for line in reader:
            # remove id from record
            del line[0]

            # remove empty string
            line = [col if col != '' else None for col in line]

            record = Record._make(line)
            logger.debug(record)

            # search for the archive file
            archive = next(project_dir.rglob(record.file))
            logger.info(f"Found {archive} as archive")

            archive = zipfile.ZipFile(archive)

            logger.debug("Get file contents")
            contents = archive.namelist()
            logger.debug(contents)

            # insert or update with a mongodb method
            dataset = Dataset.objects(file=record.file).upsert_one(
                **record._asdict(), type_=types, contents=contents)

            # ok extract content to working directory
            # TODO: doesn't work with plain text files, try to work with
            # compressed data
            working_dir = project_dir / f"data/interim/{dataset.id}"
            working_dir.mkdir(exist_ok=True)

            for member in contents:
                test = working_dir / member

                if not test.exists():
                    logger.info(f"Extract '{member}': in '{working_dir}'")
                    archive.extract(member, working_dir)

                else:
                    logger.debug(f"Skipping {member}: already extracted")

    with open(output_filepath, "w") as handle:
        # after insert collect all data of the same type
        handle.write(Dataset.objects.to_json(indent=2))

    logger.info(f"Data written into database and in {output_filepath}")
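# sanitize() is a project helper not shown here. A minimal sketch of a
# column-name sanitizer that would yield valid namedtuple field names for
# the header above (the exact rules used by the project are an assumption):

import re


def sanitize_sketch(name: str) -> str:
    # replace any run of non-alphanumeric characters with "_", strip
    # leading/trailing underscores and lowercase the result
    return re.sub(r"[^0-9a-zA-Z]+", "_", name).strip("_").lower()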