def generate_csv_files(args, data_handler):
    """
    Generate the CSV file used to call the R script.

    :param Namespace args:
    :param DataHandler data_handler:
    """
    assert isinstance(args, argparse.Namespace)
    assert isinstance(data_handler, DataHandler)

    biomarkers = data_handler.get_biomarker_names()
    measurements = data_handler.get_measurements_as_dict(min_visits=args.min_visits,
                                                         select_training_set=True,
                                                         exclude_deceased=args.exclude_deceased)
    for biomarker in biomarkers:
        print log.INFO, 'Generating output CSV for {0}...'.format(biomarker)
        samples_file = data_handler.get_samples_file(biomarker)
        writer = csv.writer(open(samples_file, 'wb'), delimiter=',')
        writer.writerow(['rid', 'progress', 'value', 'diagnosis'])

        subjects = set()
        num_samples = 0
        for rid, visits in measurements.items():
            for _, visit_data in visits.items():
                try:
                    progress = DataHandler.safe_cast(visit_data['progress'], int)
                    value = DataHandler.safe_cast(visit_data[biomarker], float)
                    diagnosis = DataHandler.safe_cast(visit_data['DX.scan'], float)
                    if progress is not None and value is not None:
                        writer.writerow([rid, progress, value, diagnosis])
                        subjects.add(rid)
                        num_samples += 1
                except KeyError:
                    pass
        print log.RESULT, 'Collected {0} samples from {1} subjects.'.format(num_samples, len(subjects))
def print_training_samples_statistics(args, data_handler):
    biomarkers = data_handler.get_biomarker_names()
    measurements = data_handler.get_measurements_as_dict(min_visits=args.min_visits,
                                                         select_training_set=True,
                                                         no_regression=True)
    for biomarker in biomarkers:
        subjects = set()
        num_samples = 0
        for rid, visits in measurements.items():
            for _, visit_data in visits.items():
                try:
                    progress = DataHandler.safe_cast(visit_data['progress'], int)
                    value = DataHandler.safe_cast(visit_data[biomarker], float)
                    if progress is not None and value is not None:
                        subjects.add(rid)
                        num_samples += 1
                except KeyError:
                    pass

        print log.RESULT, 'Biomarker {0}: collected {1} samples from {2} subjects.'.format(biomarker, num_samples, len(subjects))
def print_training_samples_statistics(args, data_handler):
    biomarkers = data_handler.get_biomarker_names()
    measurements = data_handler.get_measurements_as_dict(
        min_visits=args.min_visits,
        select_training_set=True,
        no_regression=True)
    for biomarker in biomarkers:
        subjects = set()
        num_samples = 0
        for rid, visits in measurements.items():
            for _, visit_data in visits.items():
                try:
                    progress = DataHandler.safe_cast(visit_data['progress'],
                                                     int)
                    value = DataHandler.safe_cast(visit_data[biomarker], float)
                    if progress is not None and value is not None:
                        subjects.add(rid)
                        num_samples += 1
                except KeyError:
                    pass

        print log.RESULT, 'Biomarker {0}: collected {1} samples from {2} subjects.'.format(
            biomarker, num_samples, len(subjects))