def generate_records(study_profile, patients_num, male_perc, morbidities_data, end_year, output_format,
                     dataset_manager=None, work_entity=None):
    """
    Generates dataset record files using the specified parameters.

    :param study_profile: the study profile data dictionary (just name of code can be specified)
    :param patients_num: the number of patients
    :param male_perc: the male percentage of patients (between 0 and 100)
    :param morbidities_data: the list with morbidities details
    :param end_year: the end year for generated reports
    :param output_format: the output format
    :param dataset_manager: the dataset manager that receives progress updates through update_entity;
        when it (or work_entity) is None no progress is published
    :param work_entity: the work entity (dir and session); it is forwarded to create_file and its
        second item is used as the entity name for progress updates.
        NOTE(review): create_file is assumed to accept None here - confirm before relying on the default
    :return: the total number of created report files
    """
    thread_name = threading.current_thread().name
    logger.info("start new Thread " + thread_name)
    logger.info('Creating {0} fictional patient{1}...'.format(patients_num, "s" if patients_num > 1 else ""))

    # progress can only be published when both the manager and the entity are known
    report_progress = dataset_manager is not None and work_entity is not None

    # first generate required number of random patients
    patients = [random_patient(idx, study_profile, end_year) for idx in range(1, patients_num + 1)]
    set_gender_fields(patients, male_perc)
    for morbidity_data in morbidities_data:
        apply_morbidity(patients, morbidity_data)

    files_num = 0
    patients_len = len(patients)
    # "processed" counts finished patients (1-based), so the last patient reports 100%
    for processed, patient in enumerate(patients, 1):
        # sort expected problems by diagnosis date to make them appear sequentially in the report
        patient['expected_problems'].sort(key=lambda p: p['onset'])

        # initialize initial morbidities
        populate_current_problems(patient)

        # save initial report for the current patient
        create_file(patient, output_format, work_entity)
        patient_files_num = 1
        initial_problems_num = len(patient['icd_problems'])

        # create and save new reports while patient is alive
        while patient['date_of_death'] is None:
            if not age_patient(patient, end_year):
                break
            create_file(patient, output_format, work_entity)
            patient_files_num += 1

        final_problems_num = len(patient['icd_problems'])
        if final_problems_num == 0:
            problems_summary = "no"
        elif initial_problems_num == final_problems_num:
            problems_summary = initial_problems_num
        else:
            problems_summary = str(initial_problems_num) + '-' + str(final_problems_num)
        logger.info("Generated {0} files for {1} patient with {2} problems".format(
            patient_files_num, patient['gender'].lower(), problems_summary))

        files_num += patient_files_num
        if report_progress:
            dataset_manager.update_entity(work_entity[1], 'Generating', processed * 100 / patients_len)

    if report_progress:
        dataset_manager.update_entity(work_entity[1], 'Completed', 100)
    logger.info('Successfully generated {0} report files'.format(files_num))
    logger.info('Thread run successfully, name = ' + thread_name)
    return files_num
def preload_all_configurations():
    """
    Read every file located directly in the dataset configurations directory and
    store the parsed configuration in the cache map under its title.

    :return: None
    """
    # collect regular files only, sub-directories are not configurations
    config_files = []
    for entry in listdir(DATASET_CONFIGURATIONS_DIR):
        if isfile(join(DATASET_CONFIGURATIONS_DIR, entry)):
            config_files.append(entry)

    for file_name in config_files:
        loaded = read_configuration_from_file(DATASET_CONFIGURATIONS_DIR + '/' + file_name)
        cache = get_cache_map()
        cache[loaded['title']] = loaded
        logger.info("succeed load configuration = " + loaded['title'])
def update_entity(self, name, status, progress):
    """
    Store a new status and progress on the cached dataset entity with the given name.

    Nothing is changed (and nothing is logged) when no entity is cached under that name.
    The completion timestamp is written once the status equals DATASET_COMPLETED and the
    progress has reached 100.

    :param name: the dataset name
    :param status: the dataset status
    :param progress: the dataset progress
    :return: None
    """
    entity = self.cache_map.get(name)
    if entity is None:
        return

    entity['status'] = status
    entity['progress'] = progress
    logger.info("%s status = %s, progress = %.2f%%" % (name, status, progress))

    is_finished = status == DATASET_COMPLETED and progress >= 100
    if is_finished:
        entity['completedOn'] = datetime.now().isoformat()
def preload_datasets():
    """
    Register already generated datasets in the dataset manager.

    The output folder is scanned for sub-folders; only those whose name starts with
    DATASET_PREFIX (i.e. generated through the rest api) are considered. The title and
    the output format are taken from the dot separated folder name. When the configuration
    cannot be resolved by title the error is logged and the dataset is skipped.

    :return: None
    """
    folder_names = [
        entry for entry in listdir(GENERATED_DATASETS_DIR)
        if isdir(join(GENERATED_DATASETS_DIR, entry))
    ]

    for dataset_name in folder_names:
        # folders without the prefix were not generated by the rest api
        if not dataset_name.startswith(DATASET_PREFIX):
            continue

        parts = dataset_name.split('.')
        # a missing or empty part falls back to the default value
        name = (parts[1] if len(parts) > 1 else None) or 'ERROR TO GET NAME'
        output_format = (parts[3] if len(parts) > 3 else None) or 'CCDA'

        try:
            configuration = dataset_configuration_service.get_configuration_by_title(name)
            # the folder modification time stands in for the completion time
            modified_at = stat(join(GENERATED_DATASETS_DIR, dataset_name)).st_mtime
            dataset = {
                'title': name,
                'completedOn': datetime.fromtimestamp(modified_at).isoformat(),
                'configuration': configuration,
                'status': DATASET_COMPLETED,
                'progress': 100,
                'outputFormat': output_format,
                'datasetName': dataset_name,
            }
            dataset_manager.push_entity(dataset_name, dataset)
            logger.info("succeed load dataset = " + dataset_name)
        except Exception as e:
            # a failing dataset is skipped on purpose, so the error is only logged
            logger.error(e)
def setup_work_session(output_dir, create_session_path=True, config_title=None, output_format='CCDA'):
    """
    Create a unique work folder for the current session

    The module level work_dir and session_id variables are updated as before. The same two
    values are additionally returned, so a caller can keep them for its own session instead
    of reading the module globals.

    :param output_dir: the output directory for generated dataset files
    :param create_session_path: True if session directory should be created, False otherwise
    :param config_title: the config title used
    :param output_format: the output format
    :return: the (work_dir, session_id) tuple of the new session.
        NOTE(review): the first item is assumed to be the directory callers expect - confirm
        against the consumers of the returned work entity
    """
    global work_dir
    global session_id

    # generate new session ID from the current timestamp
    now = datetime.datetime.now()
    if config_title:
        session_id = '{0}.{1}.{2}.{3}'.format(
            DATASET_PREFIX, config_title, now.strftime("%Y%m%d%H%M%S"), output_format)
    else:
        session_id = now.isoformat().replace(':', '')
    work_dir = output_dir

    # load data sources if they haven't been loaded previously
    if not data_source:
        load_datasources()

    if not os.path.exists(work_dir):
        os.makedirs(work_dir)

    if create_session_path:
        # create subfolder for the current session
        session_path = '{0}/{1}'.format(work_dir, session_id)

        # try to use short relative to the current directory path if possible
        try:
            session_path = os.path.relpath(session_path)
        except ValueError:
            # relpath cannot express the path (e.g. different drive), keep it as built
            pass

        # a leftover folder with the same session ID is replaced by an empty one
        if os.path.exists(session_path):
            shutil.rmtree(session_path)
        os.mkdir(session_path)

        logger.info("Using output folder " + session_path)

    return work_dir, session_id
def generate_records(war_era, patients_num, male_perc, morbidities_data, end_year, output_format):
    """
    Create the requested number of fictional patients, apply the morbidities to them and
    write the report files of every patient.

    :param war_era: the war era data dictionary (just name of code can be specified)
    :param patients_num: the number of patients
    :param male_perc: the male percentage of patients (between 0 and 100)
    :param morbidities_data: the list with morbidities details
    :param end_year: the end year for generated reports
    :param output_format: the output format
    :return: the total number of created report files
    """
    plural_suffix = "s" if patients_num > 1 else ""
    logger.info('Creating {0} fictional patient{1}...'.format(patients_num, plural_suffix))

    # build the random patients, numbered from 1
    patients = []
    for idx in range(1, patients_num + 1):
        patients.append(random_patient(idx, war_era, end_year))
    set_gender_fields(patients, male_perc)

    for morbidity_data in morbidities_data:
        apply_morbidity(patients, morbidity_data)

    files_num = 0
    for patient in patients:
        # problems must be ordered by onset to appear sequentially in the report
        patient['expected_problems'].sort(key=lambda problem: problem['onset'])

        # set up the morbidities the patient starts with
        populate_current_problems(patient)

        # the first report describes the initial state of the patient
        create_file(patient, output_format)
        patient_files_num = 1
        initial_problems_num = len(patient['icd_problems'])

        # keep ageing the living patient; one more report per successful ageing step
        while patient['date_of_death'] is None and age_patient(patient, end_year):
            create_file(patient, output_format)
            patient_files_num += 1

        final_problems_num = len(patient['icd_problems'])
        if final_problems_num == 0:
            problems_summary = "no"
        elif initial_problems_num == final_problems_num:
            problems_summary = initial_problems_num
        else:
            problems_summary = str(initial_problems_num) + '-' + str(final_problems_num)

        logger.info("Generated {0} files for {1} patient with {2} problems".format(
            patient_files_num, patient['gender'].lower(), problems_summary))
        files_num += patient_files_num

    logger.info('Successfully generated {0} report files'.format(files_num))
    return files_num
def load_icd10_codes():
    """
    Read the ICD-10 codes file (ICD_10_CODES_FILE_PATH) and extract code/name pairs of known
    morbidities from it. The first 8 characters of a line hold the code, the rest holds the name.
    The result is saved to the global icd_morbidity_name_by_code variable.

    :return: None
    :raises Exception: any error raised while reading the file is logged and re-raised
    """
    global icd_morbidity_name_by_code
    icd_morbidity_name_by_code = {}

    # load the ICD-10 datasource
    try:
        logger.info('Reading ICD-10 datasource...')
        lines_num = 0
        # the context manager closes the file even when parsing fails
        with open(ICD_10_CODES_FILE_PATH) as codes_file:
            for line in codes_file:
                # blank lines carry no record and must not create an empty code entry
                if not line.strip():
                    continue
                code = line[:8].strip()
                name = line[8:].strip()
                icd_morbidity_name_by_code[code] = name
                lines_num += 1
        logger.info('Loaded {0} records from ICD-10 datasource'.format(lines_num))
    except Exception as e:
        logger.error('Could not open {0}. Error: {1}'.format(ICD_10_CODES_FILE_PATH, e))
        raise
def generate_from_config(dataset_config, dataset_manager):
    """
    Generate dataset records using the provided dataset configuration parameters and save them to files

    The configuration is validated first, then the work session is set up, the new entity is
    pushed to the dataset manager and the generation itself is started in a separate thread.

    :param dataset_config: the dataset configuration dictionary
    :param dataset_manager: the dataset manager
    :return: None
    :raises ValueError: if a required configuration value is missing or invalid
    """
    output_format = dataset_config.get('outputFormat', 'CCDA')

    if 'studyProfile' not in dataset_config:
        raise ValueError('Study profile is missing in the dataset configuration')
    study_profile = dataset_config['studyProfile']
    full_study_profile = get_full_study_profile(study_profile)
    if full_study_profile is None:
        raise ValueError('Invalid study profile is specified in the dataset configuration: {0}'.format(study_profile))

    if 'numberOfPatients' not in dataset_config:
        raise ValueError('Number of patients is missing in the dataset configuration')
    patients_num = dataset_config['numberOfPatients']
    if patients_num <= 0:
        raise ValueError("Dataset configuration contains invalid number of patients: {0}".format(patients_num))

    # detect male/female ratios, default is 100% males
    male_ratio = 100
    if 'maleRatio' in dataset_config:
        male_ratio = float(dataset_config['maleRatio'])
        if 'femaleRatio' in dataset_config:
            ratio_sum = male_ratio + float(dataset_config['femaleRatio'])
            # floats are compared with a tolerance; "not ... <=" keeps NaN rejected
            if not abs(ratio_sum - 100) <= 1e-6:
                raise ValueError("Both male ({0}) and female ({1}) ratios are specified in the dataset configuration, "
                                 "but their sum is not equal to 100".format(dataset_config['maleRatio'],
                                                                            dataset_config['femaleRatio']))
    elif 'femaleRatio' in dataset_config:
        male_ratio = 100 - float(dataset_config['femaleRatio'])
    if not 0 <= male_ratio <= 100:
        raise ValueError("Dataset configuration contains invalid male ratio: {0}".format(male_ratio))

    # detect and check end year
    if 'year' not in dataset_config:
        raise ValueError('End report year is missing in the dataset configuration')
    end_year = dataset_config['year']
    if end_year <= full_study_profile['start_date'].year:
        raise ValueError('End report year must be greater than start date')

    # setup output directory
    cur_work_dir = GENERATED_DATASETS_DIR
    if 'outputFolder' in dataset_config:
        cur_work_dir = dataset_config['outputFolder']
    work_entity = setup_work_session(cur_work_dir, True, dataset_config['title'], output_format)

    # retrieve morbidities from configuration or data source
    if 'morbiditiesData' not in dataset_config:
        study_profile_code = full_study_profile['study_profile_code']
        try:
            dataset_config['morbiditiesData'] = get_morbidities_from_study_profile_code(
                study_profile_code, include_percentage=True)
        except EntityNotFoundError:
            raise ValueError('CSV file for study profile with code {0} does not exist'.format(study_profile_code))
        logger.info('Using morbidity probabilities of {0} from configuration file'.format(study_profile_code))

    # work on a copy: extending the configured list in place would leak the related
    # conditions into the configuration that is stored in the dataset manager below
    morbidities_data = list(dataset_config['morbiditiesData'])
    related_conditions = dataset_config.get('relatedConditionsData')
    if related_conditions:
        morbidities_data.extend(related_conditions)

    # push new entity; the second item of the work entity is used as the dataset name
    dataset_name = work_entity[1]
    dataset_manager.push_entity(dataset_name, {
        'title': dataset_config['title'],
        'completedOn': 'N/A',
        'configuration': dataset_config,
        'status': 'Generating',
        'progress': 0,
        'outputFormat': output_format,
        'datasetName': dataset_name
    })

    # start new thread
    generate_thread = threading.Thread(
        target=generate_records,
        args=(full_study_profile, patients_num, male_ratio, morbidities_data, end_year, output_format,
              dataset_manager, work_entity,))
    generate_thread.start()
def frontend_app_folder(filename):
    """
    Serve frontend folder resource and inject frontend url state

    :param filename: the requested resource path, looked up inside FRONTEND_DIR
    :return: the resource content, or the result of frontend_app_index() when no such
        file exists in FRONTEND_DIR
    """
    if not os.path.exists(os.path.join(FRONTEND_DIR, filename)):
        # unknown paths are answered by the index handler
        return frontend_app_index()
    return send_from_directory(FRONTEND_DIR, filename)


if __name__ == '__main__':
    # start flask app
    logger.info('Checking output directories...')

    # check and create dirs
    if not os.path.exists(DATASET_CONFIGURATIONS_DIR):
        os.makedirs(DATASET_CONFIGURATIONS_DIR)
    if not os.path.exists(GENERATED_DATASETS_DIR):
        os.makedirs(GENERATED_DATASETS_DIR)

    # init randomizer (no session folder is created at startup)
    setup_work_session(GENERATED_DATASETS_DIR, create_session_path=False)

    logger.info('Starting app at port = {0}, with mode = {1}'.format(WEB_PORT, FLASK_RUN_MODE))

    # inject routers
    init(app)

    # NOTE(review): this call was cut off after the port argument in the reviewed source;
    # it is closed here with the arguments that were visible - restore any further
    # keyword arguments (if there were some) from the original file
    app.run(debug=(FLASK_RUN_MODE == 'DEBUG'), port=int(WEB_PORT))
def generate_from_config(dataset_config):
    """
    Generate dataset records using the provided dataset configuration parameters and save them to files

    :param dataset_config: the dataset configuration dictionary
    :return: the value returned by generate_records (the total number of created report files)
    :raises ValueError: if a required configuration value is missing or invalid
    """
    output_format = dataset_config.get('outputFormat', 'CCDA')

    if 'warEra' not in dataset_config:
        raise ValueError('War era is missing in the dataset configuration')
    war_era = dataset_config['warEra']
    full_war_era = get_full_war_era(war_era)
    if full_war_era is None:
        raise ValueError(
            'Invalid war era is specified in the dataset configuration: {0}'.format(war_era))

    if 'numberOfPatients' not in dataset_config:
        raise ValueError('Number of patients is missing in the dataset configuration')
    patients_num = dataset_config['numberOfPatients']
    if patients_num <= 0:
        raise ValueError(
            "Dataset configuration contains invalid number of patients: {0}".format(patients_num))

    # detect male/female ratios, default is 100% males
    male_ratio = 100
    if 'maleRatio' in dataset_config:
        male_ratio = float(dataset_config['maleRatio'])
        if 'femaleRatio' in dataset_config:
            ratio_sum = male_ratio + float(dataset_config['femaleRatio'])
            # floats are compared with a tolerance; "not ... <=" keeps NaN rejected
            if not abs(ratio_sum - 100) <= 1e-6:
                raise ValueError(
                    "Both male ({0}) and female ({1}) ratios are specified in the dataset configuration, "
                    "but their sum is not equal to 100".format(
                        dataset_config['maleRatio'], dataset_config['femaleRatio']))
    elif 'femaleRatio' in dataset_config:
        male_ratio = 100 - float(dataset_config['femaleRatio'])
    if not 0 <= male_ratio <= 100:
        raise ValueError(
            "Dataset configuration contains invalid male ratio: {0}".format(male_ratio))

    # detect and check end year
    if 'year' not in dataset_config:
        raise ValueError('End report year is missing in the dataset configuration')
    end_year = dataset_config['year']
    if end_year <= full_war_era['start_date'].year:
        raise ValueError('End report year must be greater than start date')

    # setup output directory
    cur_work_dir = GENERATED_DATASETS_DIR
    if 'outputFolder' in dataset_config:
        cur_work_dir = dataset_config['outputFolder']
    setup_work_session(cur_work_dir, True, dataset_config['title'], output_format)

    # retrieve morbidities from configuration or data source
    if 'morbiditiesData' not in dataset_config:
        war_code = full_war_era['war_code']
        try:
            dataset_config['morbiditiesData'] = get_morbidities_from_war_code(
                war_code, include_percentage=True)
        except EntityNotFoundError:
            raise ValueError(
                'CSV file for war era with code {0} does not exist'.format(war_code))
        logger.info(
            'Using morbidity probabilities of {0} from configuration file'.format(war_code))

    # work on a copy: extending the configured list in place would append the related
    # conditions to the caller's configuration again on every run
    morbidities_data = list(dataset_config['morbiditiesData'])
    related_conditions = dataset_config.get('relatedConditionsData')
    if related_conditions:
        morbidities_data.extend(related_conditions)

    return generate_records(full_war_era, patients_num, male_ratio,
                            morbidities_data, end_year, output_format)