Esempio n. 1
0
def generate_records(study_profile, patients_num, male_perc, morbidities_data, end_year, output_format,
                     dataset_manager=None, work_entity=None):

    """
    Generates dataset record files using the specified parameters.

    One initial report is written for every patient, then the patient is aged repeatedly and a new
    report is written after each successful aging step, until the patient has a date of death or
    age_patient() returns a falsy value.

    :param study_profile: the study profile data, passed through to random_patient()
    :param patients_num: the number of patients
    :param male_perc: the male percentage of patients (between 0 and 100)
    :param morbidities_data: the list with morbidities details, each item is applied to all patients
    :param end_year: the end year for generated reports
    :param output_format: the output format
    :param dataset_manager: the dataset manager that receives progress updates through update_entity();
        progress reporting is skipped when it (or work_entity) is not provided
    :param work_entity: the work entity, passed to create_file(); its item at index 1 is used as the
        entity name for progress updates (presumably a (dir, session) pair - confirm against the caller)
    :return: the total number of created report files
    """
    # progress can only be reported when both the manager and the entity name are available
    report_progress = dataset_manager is not None and work_entity is not None
    thread_name = threading.current_thread().name

    logger.info("start new Thread " + thread_name)
    logger.info('Creating {0} fictional patient{1}...'.format(patients_num, "s" if patients_num > 1 else ""))

    # first generate required number of random patients
    patients = [random_patient(idx, study_profile, end_year) for idx in range(1, patients_num + 1)]

    set_gender_fields(patients, male_perc)

    for morbidity_data in morbidities_data:
        apply_morbidity(patients, morbidity_data)

    files_num = 0
    patients_len = len(patients)
    for index, patient in enumerate(patients):
        # sort expected problems by diagnosis date to make them appear sequentially in the report
        patient['expected_problems'].sort(key=lambda p: p['onset'])

        # initialize initial morbidities
        populate_current_problems(patient)

        # save initial report for the current patient
        create_file(patient, output_format, work_entity)
        patient_files_num = 1
        initial_problems_num = len(patient['icd_problems'])

        # create and save new reports while patient is alive
        while patient['date_of_death'] is None:
            if not age_patient(patient, end_year):
                break
            create_file(patient, output_format, work_entity)
            patient_files_num += 1

        final_problems_num = len(patient['icd_problems'])
        logger.info("Generated {0} files for {1} patient with {2} problems".format(
            patient_files_num, patient['gender'].lower(),
            "no" if final_problems_num == 0
            else initial_problems_num if initial_problems_num == final_problems_num
            else str(initial_problems_num) + '-' + str(final_problems_num)))
        files_num += patient_files_num
        if report_progress:
            # index is zero-based, so index + 1 patients are finished at this point
            dataset_manager.update_entity(work_entity[1], 'Generating', (index + 1) * 100.0 / patients_len)

    if report_progress:
        dataset_manager.update_entity(work_entity[1], 'Completed', 100)
    logger.info('Successfully generated {0} report files'.format(files_num))
    logger.info('Thread run successfully, name = ' + thread_name)
    return files_num
Esempio n. 2
0
def preload_all_configurations():
    """
    Read every configuration file from the configurations directory and put it into the cache.

    Only regular files located directly inside DATASET_CONFIGURATIONS_DIR are considered. Each loaded
    configuration is stored in the cache map under its 'title' value.
    :return: None
    """
    # collect the file names first, directories and other non-file entries are ignored
    config_file_names = []
    for entry in listdir(DATASET_CONFIGURATIONS_DIR):
        if isfile(join(DATASET_CONFIGURATIONS_DIR, entry)):
            config_file_names.append(entry)

    for file_name in config_file_names:
        config_path = DATASET_CONFIGURATIONS_DIR + '/' + file_name
        loaded = read_configuration_from_file(config_path)
        cache = get_cache_map()
        title = loaded['title']
        cache[title] = loaded
        logger.info("succeed load configuration = " + title)
 def update_entity(self, name, status, progress):
     """
     Update the status and progress of a cached dataset entity.

     Nothing happens when no entity is cached under the given name. When the new status equals
     DATASET_COMPLETED and the progress is at least 100, the 'completedOn' field is set to the
     current local time in ISO format.
     :param name:  the dataset name, used as the key in the cache map
     :param status: the new dataset status
     :param progress: the new dataset progress (logged as a percentage)
     :return: None
     """
     entity = self.cache_map.get(name)
     if entity is None:
         # unknown dataset, nothing to update
         return

     entity['status'] = status
     entity['progress'] = progress
     message = "%s status = %s, progress = %.2f%%" % (name, status, progress)
     logger.info(message)

     is_finished = status == DATASET_COMPLETED and progress >= 100
     if is_finished:
         entity['completedOn'] = datetime.now().isoformat()
def preload_datasets():
    """
    Register already generated datasets in the dataset manager.

    Every sub-folder of GENERATED_DATASETS_DIR whose name starts with DATASET_PREFIX is treated as a
    dataset. The folder name is split by '.': the second part is used as the configuration title and
    the fourth part as the output format ('CCDA' when it is missing or empty). Folders without the
    prefix are ignored.
    If loading the configuration (or any other step for a folder) fails, the error is logged and that
    dataset is skipped.
    :return: None
    """
    # collect the candidate folders first, plain files are ignored
    dataset_folders = []
    for entry in listdir(GENERATED_DATASETS_DIR):
        if isdir(join(GENERATED_DATASETS_DIR, entry)):
            dataset_folders.append(entry)

    for dataset_name in dataset_folders:
        # folders without the prefix are not registered
        if not dataset_name.startswith(DATASET_PREFIX):
            continue

        parts = dataset_name.split('.')

        # a missing or empty part falls back to the default value
        if len(parts) > 1 and parts[1]:
            name = parts[1]
        else:
            name = 'ERROR TO GET NAME'

        if len(parts) > 3 and parts[3]:
            output_format = parts[3]
        else:
            output_format = 'CCDA'

        try:
            configurations = dataset_configuration_service.get_configuration_by_title(name)
            # the folder modification time is used as the completion time
            folder_mtime = stat(join(GENERATED_DATASETS_DIR, dataset_name)).st_mtime
            dataset = {
                'title': name,
                'completedOn': datetime.fromtimestamp(folder_mtime).isoformat(),
                'configuration': configurations,
                'status': DATASET_COMPLETED,
                'progress': 100,
                'outputFormat': output_format,
                'datasetName': dataset_name
            }
            dataset_manager.push_entity(dataset_name, dataset)
            logger.info("succeed load dataset = " + dataset_name)
        except Exception as e:
            # a failure for one dataset must not stop the preload, so it is only logged
            logger.error(e)
Esempio n. 5
0
def setup_work_session(output_dir,
                       create_session_path=True,
                       config_title=None,
                       output_format='CCDA'):
    """
    Create a unique work folder for the current session

    The module-level work_dir and session_id variables are updated as a side effect. The session ID
    is '<DATASET_PREFIX>.<config_title>.<timestamp>.<output_format>' when a config title is given,
    otherwise the current ISO timestamp with the colons removed.
    :param output_dir: the output directory for generated dataset files (created if it is missing)
    :param create_session_path: True if session directory should be created, False otherwise
    :param config_title: the config title used
    :param output_format: the output format
    :return: the (work_dir, session_id) tuple of the configured session
    """
    global work_dir
    global session_id

    # generate new session ID from the current timestamp
    if config_title:
        session_id = '{0}.{1}.{2}.{3}'.format(
            DATASET_PREFIX, config_title,
            datetime.datetime.now().strftime("%Y%m%d%H%M%S"),
            output_format)
    else:
        session_id = datetime.datetime.now().isoformat().replace(':', '')
    work_dir = output_dir

    # load data sources if they haven't been loaded previously
    if not data_source:
        load_datasources()

    if not os.path.exists(work_dir):
        os.makedirs(work_dir)

    if create_session_path:
        # create subfolder for the current session
        session_path = os.path.join(work_dir, session_id)
        # try to use short relative to the current directory path if possible
        try:
            session_path = os.path.relpath(session_path)
        except ValueError:
            # relpath can fail (e.g. paths on different drives), keep the original path then
            pass

        # an existing folder with the same session ID is replaced by an empty one
        if os.path.exists(session_path):
            shutil.rmtree(session_path)
        os.mkdir(session_path)
        logger.info("Using output folder " + session_path)

    # returned so that callers can refer to the session without reading the module globals
    return work_dir, session_id
Esempio n. 6
0
def generate_records(war_era, patients_num, male_perc, morbidities_data,
                     end_year, output_format):
    """
    Create fictional patients and write their report files.

    For every patient one initial report is written, then the patient is aged repeatedly and one more
    report is written after each successful aging step. Aging stops as soon as the patient has a date
    of death or age_patient() returns a falsy value.
    :param war_era: the war era data, passed through to random_patient()
    :param patients_num: the number of patients
    :param male_perc: the male percentage of patients (between 0 and 100)
    :param morbidities_data: the list with morbidities details, each item is applied to all patients
    :param end_year: the end year for generated reports
    :param output_format: the output format
    :return: the total number of created report files
    """
    plural_suffix = "s" if patients_num > 1 else ""
    logger.info('Creating {0} fictional patient{1}...'.format(
        patients_num, plural_suffix))

    # build the requested number of random patients, patient indexes start at 1
    patients = []
    for idx in range(1, patients_num + 1):
        patients.append(random_patient(idx, war_era, end_year))

    set_gender_fields(patients, male_perc)

    for morbidity_data in morbidities_data:
        apply_morbidity(patients, morbidity_data)

    total_files = 0

    for patient in patients:
        # order expected problems by onset so that they show up sequentially in the reports
        patient['expected_problems'].sort(key=lambda problem: problem['onset'])

        # set up the morbidities the patient starts with
        populate_current_problems(patient)

        # the first report describes the initial state of the patient
        create_file(patient, output_format)
        report_count = 1
        problems_at_start = len(patient['icd_problems'])

        # keep aging the living patient and write one report per successful aging step
        while patient['date_of_death'] is None and age_patient(
                patient, end_year):
            create_file(patient, output_format)
            report_count += 1

        problems_at_end = len(patient['icd_problems'])
        if problems_at_end == 0:
            problems_label = "no"
        elif problems_at_start == problems_at_end:
            problems_label = problems_at_start
        else:
            # the number of problems changed while aging, log it as a range
            problems_label = str(problems_at_start) + '-' + str(problems_at_end)

        logger.info(
            "Generated {0} files for {1} patient with {2} problems".format(
                report_count, patient['gender'].lower(), problems_label))
        total_files += report_count

    logger.info('Successfully generated {0} report files'.format(total_files))

    return total_files
Esempio n. 7
0
def load_icd10_codes():
    """
    Read the ICD-10 codes file and extract code/name pairs from it.

    Every line is split at a fixed position: the first 8 characters hold the code and the rest of the
    line holds the name, both with surrounding whitespace removed.
    The result is saved to the global icd_morbidity_name_by_code variable, replacing its previous value.
    :return: None
    :raises Exception: any error raised while reading the file is logged and re-raised unchanged
    """
    global icd_morbidity_name_by_code
    icd_morbidity_name_by_code = {}

    # load the ICD-10 datasource
    try:
        logger.info('Reading ICD-10 datasource...')
        lines_num = 0
        # the context manager closes the file even when reading fails midway
        with open(ICD_10_CODES_FILE_PATH) as codes_file:
            for line in codes_file:
                code = line[:8].strip()
                name = line[8:].strip()
                icd_morbidity_name_by_code[code] = name
                lines_num += 1
        logger.info('Loaded {0} records from ICD-10 datasource'.format(lines_num))
    except Exception as e:
        logger.error('Could not open {0}. Error: {1}'.format(ICD_10_CODES_FILE_PATH, e))
        raise
Esempio n. 8
0
def generate_from_config(dataset_config, dataset_manager):
    """
    Generate dataset records using the provided dataset configuration parameters and save them to files

    The configuration is validated, a work session is set up, a new entity with the 'Generating' status
    is pushed to the dataset manager and the records are generated in a separate thread that is started
    before this function returns.
    When the configuration has no 'morbiditiesData', the morbidities are loaded by the study profile
    code and stored back into dataset_config under that key.
    :param dataset_config: the dataset configuration dictionary
    :param dataset_manager: the dataset manager
    :return: None
    :raises ValueError: if a required configuration value is missing or invalid
    """

    output_format = dataset_config.get('outputFormat', 'CCDA')
    if 'studyProfile' not in dataset_config:
        raise ValueError('Study profile is missing in the dataset configuration')
    study_profile = dataset_config['studyProfile']
    full_study_profile = get_full_study_profile(study_profile)
    if full_study_profile is None:
        raise ValueError('Invalid study profile is specified in the dataset configuration: {0}'.format(study_profile))

    if 'numberOfPatients' not in dataset_config:
        raise ValueError('Number of patients is missing in the dataset configuration')
    patients_num = dataset_config['numberOfPatients']
    if patients_num <= 0:
        raise ValueError("Dataset configuration contains invalid number of patients: {0}".format(patients_num))

    # detect male/female ratios, default is 100% males
    male_ratio = 100
    if 'maleRatio' in dataset_config:
        male_ratio = float(dataset_config['maleRatio'])
        if 'femaleRatio' in dataset_config:
            ratio_sum = male_ratio + float(dataset_config['femaleRatio'])
            # compare with a tolerance, a sum of two floats can miss 100 by a rounding error
            if abs(ratio_sum - 100) > 1e-9:
                raise ValueError("Both male ({0}) and female ({1}) ratios are specified in the dataset configuration, "
                                 "but their sum is not equal to 100".format(dataset_config['maleRatio'],
                                                                            dataset_config['femaleRatio']))
    elif 'femaleRatio' in dataset_config:
        male_ratio = 100 - float(dataset_config['femaleRatio'])

    if male_ratio < 0 or male_ratio > 100:
        raise ValueError("Dataset configuration contains invalid male ratio: {0}".format(male_ratio))

    # detect and check end year
    if 'year' not in dataset_config:
        raise ValueError('End report year is missing in the dataset configuration')
    end_year = dataset_config['year']
    if end_year <= full_study_profile['start_date'].year:
        raise ValueError('End report year must be greater than start date')

    # setup output directory
    cur_work_dir = GENERATED_DATASETS_DIR
    if 'outputFolder' in dataset_config:
        cur_work_dir = dataset_config['outputFolder']
    work_entity = setup_work_session(cur_work_dir, True, dataset_config['title'], output_format)

    # retrieve morbidities from configuration or data source
    if 'morbiditiesData' not in dataset_config:
        study_profile_code = full_study_profile['study_profile_code']
        try:
            dataset_config['morbiditiesData'] = get_morbidities_from_study_profile_code(study_profile_code,
                                                                                        include_percentage=True)
        except EntityNotFoundError:
            raise ValueError('CSV file for study profile with code {0} does not exist'.format(study_profile_code))
        logger.info('Using morbidity probabilities of {0} from configuration file'.format(study_profile_code))

    # work on a copy: extending the configuration's own list would add the related conditions to the
    # stored configuration and duplicate them whenever the same configuration is used again
    morbidities_data = list(dataset_config['morbiditiesData'])

    related_conditions = dataset_config.get('relatedConditionsData')
    if related_conditions:
        morbidities_data.extend(related_conditions)

    # push new entity, the item at index 1 of work_entity is used as the dataset name
    dataset_manager.push_entity(work_entity[1], {
        'title': dataset_config['title'],
        'completedOn': 'N/A',
        'configuration': dataset_config,
        'status': 'Generating',
        'progress': 0,
        'outputFormat': output_format,
        'datasetName': work_entity[1]
    })
    # start new thread
    generate_thread = threading.Thread(target=generate_records,
                                       args=(full_study_profile, patients_num, male_ratio,
                                             morbidities_data, end_year,
                                             output_format,
                                             dataset_manager, work_entity,))
    generate_thread.start()
Esempio n. 9
0
def frontend_app_folder(filename):
    """
    Serve a resource from the frontend folder.

    When the requested path does not exist inside FRONTEND_DIR, the result of frontend_app_index() is
    returned instead (presumably the frontend index page that handles the URL state - confirm).
    :param filename: the path of the requested resource, relative to FRONTEND_DIR
    :return: the resource content
    """
    requested_path = os.path.join(FRONTEND_DIR, filename)
    if os.path.exists(requested_path):
        return send_from_directory(FRONTEND_DIR, filename)

    # unknown paths fall back to the index handler
    return frontend_app_index()


if __name__ == '__main__':
    # start flask app

    logger.info('Checking output directories...')

    # check and create dirs
    if not os.path.exists(DATASET_CONFIGURATIONS_DIR):
        os.makedirs(DATASET_CONFIGURATIONS_DIR)
    if not os.path.exists(GENERATED_DATASETS_DIR):
        os.makedirs(GENERATED_DATASETS_DIR)

    # init randomizer
    setup_work_session(GENERATED_DATASETS_DIR, create_session_path=False)
    logger.info('Starting app at port = {0}, with mode = {1}'.format(
        WEB_PORT, FLASK_RUN_MODE))
    # inject routers
    init(app)
    app.run(debug=(FLASK_RUN_MODE == 'DEBUG'),
            port=int(WEB_PORT),
Esempio n. 10
0
def generate_from_config(dataset_config):
    """
    Generate dataset records using the provided dataset configuration parameters and save them to files

    The configuration is validated, a work session is set up and the records are generated before this
    function returns.
    When the configuration has no 'morbiditiesData', the morbidities are loaded by the war code and
    stored back into dataset_config under that key.
    :param dataset_config: the dataset configuration dictionary
    :return: the value returned by generate_records (the total number of created report files)
    :raises ValueError: if a required configuration value is missing or invalid
    """

    output_format = dataset_config.get('outputFormat', 'CCDA')
    if 'warEra' not in dataset_config:
        raise ValueError('War era is missing in the dataset configuration')
    war_era = dataset_config['warEra']
    full_war_era = get_full_war_era(war_era)
    if full_war_era is None:
        raise ValueError(
            'Invalid war era is specified in the dataset configuration: {0}'.
            format(war_era))

    if 'numberOfPatients' not in dataset_config:
        raise ValueError(
            'Number of patients is missing in the dataset configuration')
    patients_num = dataset_config['numberOfPatients']
    if patients_num <= 0:
        raise ValueError(
            "Dataset configuration contains invalid number of patients: {0}".
            format(patients_num))

    # detect male/female ratios, default is 100% males
    male_ratio = 100
    if 'maleRatio' in dataset_config:
        male_ratio = float(dataset_config['maleRatio'])
        if 'femaleRatio' in dataset_config:
            ratio_sum = male_ratio + float(dataset_config['femaleRatio'])
            # compare with a tolerance, a sum of two floats can miss 100 by a rounding error
            if abs(ratio_sum - 100) > 1e-9:
                raise ValueError(
                    "Both male ({0}) and female ({1}) ratios are specified in the dataset configuration, "
                    "but their sum is not equal to 100".format(
                        dataset_config['maleRatio'],
                        dataset_config['femaleRatio']))
    elif 'femaleRatio' in dataset_config:
        male_ratio = 100 - float(dataset_config['femaleRatio'])

    if male_ratio < 0 or male_ratio > 100:
        raise ValueError(
            "Dataset configuration contains invalid male ratio: {0}".format(
                male_ratio))

    # detect and check end year
    if 'year' not in dataset_config:
        raise ValueError(
            'End report year is missing in the dataset configuration')
    end_year = dataset_config['year']
    if end_year <= full_war_era['start_date'].year:
        raise ValueError('End report year must be greater than start date')

    # setup output directory
    cur_work_dir = GENERATED_DATASETS_DIR
    if 'outputFolder' in dataset_config:
        cur_work_dir = dataset_config['outputFolder']
    setup_work_session(cur_work_dir, True, dataset_config['title'],
                       output_format)

    # retrieve morbidities from configuration or data source
    if 'morbiditiesData' not in dataset_config:
        war_code = full_war_era['war_code']
        try:
            dataset_config['morbiditiesData'] = get_morbidities_from_war_code(
                war_code, include_percentage=True)
        except EntityNotFoundError:
            raise ValueError(
                'CSV file for war era with code {0} does not exist'.format(
                    war_code))
        logger.info(
            'Using morbidity probabilities of {0} from configuration file'.
            format(war_code))

    # work on a copy: extending the configuration's own list would add the related conditions to the
    # caller's configuration and duplicate them whenever the same configuration is used again
    morbidities_data = list(dataset_config['morbiditiesData'])

    related_conditions = dataset_config.get('relatedConditionsData')
    if related_conditions:
        morbidities_data.extend(related_conditions)

    return generate_records(full_war_era, patients_num, male_ratio,
                            morbidities_data, end_year, output_format)