import os
import logging

from pathlib import Path

from pymongo import MongoClient
from mongoengine.errors import MultipleObjectsReturned

# project-local helpers: exact module paths are assumed from the project
# layout and may differ in the full repository
from src.features.smarterdb import global_connection, SMARTERDB, VariantSheep
from src.features.dbsnp import read_dbSNP, search_agr_bs


def main():
    logger = logging.getLogger(__name__)

    # often useful for finding various files relative to the project root
    project_dir = Path(__file__).resolve().parents[2]
    dbsnp_file = "data/external/SHE/dbSNP/ds_ch24.xml.gz"
    dbsnp_path = project_dir / dbsnp_file

    # connect to database through mongoengine
    global_connection()

    # connect to database with the plain pymongo client
    conn = MongoClient(
        'mongodb://localhost:27017/',
        username=os.getenv("MONGODB_SMARTER_USER"),
        password=os.getenv("MONGODB_SMARTER_PASS"))

    # get my SNP names from database
    pipeline = [
        {"$project": {"_id": 0, "name": 1}},
        {"$group": {
            "_id": None,
            "total": {"$sum": 1},
            "items": {"$push": "$name"}
        }}
    ]

    logger.info("Search all names in database")

    # execute the aggregation pipeline with the pymongo client and get all
    # SNP names from the database (the $group stage yields a single document)
    result = next(conn[SMARTERDB]["variantSheep"].aggregate(pipeline))

    # collect all names in a set
    all_snp_names = set(result["items"])

    logger.info(f"Reading from {dbsnp_path}")

    # cycle among dbSNP objects
    for snp in filter(search_agr_bs, read_dbSNP(dbsnp_path)):
        found = False

        for i, ss in enumerate(snp["ss"]):
            if ss['locSnpId'] in all_snp_names:
                found = True
                break

        if not found:
            logger.debug(f"Skipping rsId {snp['rsId']}")
            continue

        # keep only the matched subsnp
        snp['ss'] = snp['ss'][i]

        # get the locSnpId to search (ex. OAR24_20639954)
        name = snp['ss']['locSnpId']

        # query by exact match: a regex would be super slow
        qs = VariantSheep.objects(name=name)

        if qs.count() > 0:
            # get variant
            try:
                variant = qs.get()
                logger.info(f"{variant}: {snp}")
            except MultipleObjectsReturned:
                logger.error(qs.all())

        # debug: stop after the first matched SNP
        break

    logger.info("Completed")
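# A minimal, dependency-free sketch of what the $project/$group pipeline
# above computes: every "name" value gathered into one result, which is why
# a single next() call reads it. collect_snp_names() and the sample
# documents below are illustrative, not part of the original script or of
# the database content.
def collect_snp_names(documents):
    """Pure-Python equivalent of gathering all 'name' values into a set."""
    return {document["name"] for document in documents}


# usage sketch with made-up documents:
# collect_snp_names([{"name": "OAR24_20639954"}, {"name": "OAR24_20640000"}])
# -> {"OAR24_20639954", "OAR24_20640000"}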
"""Add or update a breed into SMARTER database""" logger.info(f"{Path(__file__).name} started") # get the dataset object dataset = Dataset.objects(file=dataset).get() # fix input parameters aliases = [BreedAlias(fid=fid, dataset=dataset) for fid in alias] species = species.capitalize() code = code.upper() # get a breed object relying on parameters breed, modified = get_or_create_breed( species=species, name=name, code=code, aliases=aliases) if modified: logger.info(f"{breed} added to database") logger.info(f"{Path(__file__).name} ended") if __name__ == '__main__': log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' logging.basicConfig(level=logging.INFO, format=log_fmt) # connect to database global_connection() main()
import csv
import logging
import zipfile
import collections

from pathlib import Path

# project-local helpers: exact module paths are assumed from the project
# layout and may differ in the full repository
from src.features.smarterdb import global_connection, Dataset
from src.features.utils import sanitize

project_dir = Path(__file__).resolve().parents[2]


def main(input_filepath, output_filepath, types):
    """
    Runs data processing scripts to turn raw data from (../raw) into
    cleaned data ready to be analyzed (saved in ../processed).
    """

    logger = logging.getLogger(__name__)

    # connect to database
    global_connection()

    with open(input_filepath) as handle:
        reader = csv.reader(handle, delimiter=";")
        header = next(reader)

        # remove the id column from the header
        del header[0]

        # sanitize column names
        header = [sanitize(col) for col in header]

        logger.info("Got %s as header" % header)

        # define a datatype for my data
        Record = collections.namedtuple("Record", header)

        for line in reader:
            # remove id from record
            del line[0]

            # replace empty strings with None
            line = [col if col != '' else None for col in line]

            record = Record._make(line)
            logger.debug(record)

            # search for the archive file
            archive = next(project_dir.rglob(record.file))
            logger.info(f"Found {archive} as archive")

            archive = zipfile.ZipFile(archive)

            logger.debug("Get file contents")
            contents = archive.namelist()
            logger.debug(contents)

            # insert or update with a mongodb method
            dataset = Dataset.objects(file=record.file).upsert_one(
                **record._asdict(), type_=types, contents=contents)

            # ok, extract content to the working directory
            # TODO: don't work with plain text files, try to work with
            # compressed data
            working_dir = project_dir / f"data/interim/{dataset.id}"
            working_dir.mkdir(exist_ok=True)

            for member in contents:
                test = working_dir / member

                if not test.exists():
                    logger.info(f"Extract '{member}': in '{working_dir}'")
                    archive.extract(member, working_dir)
                else:
                    logger.debug(f"Skipping {member}: already extracted")

    with open(output_filepath, "w") as handle:
        # after insert, collect all data of the same type
        handle.write(Dataset.objects.to_json(indent=2))

    logger.info(f"Data written into database and in {output_filepath}")
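# sanitize() comes from the project's utilities and its body is not shown
# above. A minimal sketch of what such a helper could do, given that
# namedtuple field names must be valid Python identifiers; this is a guess,
# not the project's actual implementation:
import re


def sanitize_sketch(column: str) -> str:
    """Lowercase a column name and replace non-alphanumeric runs with '_'."""
    return re.sub(r"[^0-9a-zA-Z]+", "_", column.strip()).strip("_").lower()


# usage sketch:
# sanitize_sketch("File Name")  ->  "file_name"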