Example #1
def test_fi1(fi1_df, measure, expected_type):
    # Classify one measure column and check it gets the expected type.
    values = fi1_df[measure]
    classifier = MeasureClassifier(default_config())
    classifier_report = MeasureClassifier.meta_measures(values)
    measure_type = classifier.classify(classifier_report)

    assert measure_type == expected_type
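
The flow above (build a MeasureClassifier from default_config(), compute meta measures, then classify) also works on a hand-built pandas Series. The sketch below follows the same steps; the import path is an assumption, since these examples do not show where MeasureClassifier, MeasureType and default_config are actually defined.

import pandas as pd

# NOTE: the module path is an assumption; point it at wherever
# MeasureClassifier, MeasureType and default_config live in your codebase.
from pheno.prepare.measure_classifier import (
    MeasureClassifier,
    MeasureType,
    default_config,
)

values = pd.Series(["1.5", "2.0", "3.5", "2.0", "1.5", "4.0", "2.5"])

classifier = MeasureClassifier(default_config())
report = MeasureClassifier.meta_measures(values)
measure_type = classifier.classify(report)

# A numeric series like this is expected to classify as ordinal or continuous,
# depending on the configured min_rank thresholds.
assert isinstance(measure_type, MeasureType)
print(measure_type)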
Example #2
def parse_config(args):
    """Translate parsed command-line arguments into a preparation config."""
    config = default_config()
    config.verbose = args.verbose
    config.instruments.dir = args.instruments
    config.pedigree = args.pedigree
    config.db.filename = args.output

    # Collect measure columns to skip, from a file and/or a comma-separated
    # command-line list.
    skip_columns = set()
    if args.skip_file:
        assert os.path.exists(args.skip_file)
        with open(args.skip_file, "r") as infile:
            columns = [col.strip() for col in infile.readlines()]
            skip_columns = skip_columns | set(columns)
    if args.skip_columns:
        skip_columns = skip_columns | set(args.skip_columns.split(","))

    config.skip.measures = skip_columns
    if args.composite_fids:
        config.family.composite_key = args.composite_fids

    # Role handling: how person roles are determined and which column
    # mapping rules to use.
    if args.role:
        config.person.role.type = args.role
    assert config.person.role.type in ("column", "guess")

    if args.role_mapping:
        config.person.role.mapping = args.role_mapping
    assert config.person.role.mapping in ("SPARK", "SSC", "INTERNAL")

    if args.person_column:
        config.person.column = args.person_column

    # Classification thresholds are overridden only when a non-negative value
    # is given on the command line.
    if args.min_individuals is not None and args.min_individuals >= 0:
        config.classification.min_individuals = args.min_individuals

    if args.categorical is not None and args.categorical >= 0:
        config.classification.categorical.min_rank = args.categorical

    if args.ordinal is not None and args.ordinal >= 0:
        config.classification.ordinal.min_rank = args.ordinal

    if args.continuous is not None and args.continuous >= 0:
        config.classification.continuous.min_rank = args.continuous

    if args.tab_separated:
        config.instruments.tab_separated = True

    if args.report_only:
        config.db.filename = "memory"
        config.report_only = args.report_only

    if args.parallel:
        config.parallel = args.parallel

    return config
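
parse_config reads a fixed set of attribute names, matching what the argument parser in Example #7 below defines. One way to exercise it without a real command line is to build an argparse.Namespace by hand, as in the sketch below; the concrete values and file names are purely illustrative.

from argparse import Namespace

# Every attribute mirrors a name that parse_config reads; values are
# illustrative only.
args = Namespace(
    verbose=1,
    instruments="instruments/",
    pedigree="families.ped",
    output="pheno.db",
    skip_file=None,
    skip_columns="person_id,family_id",
    composite_fids=False,
    role="guess",
    role_mapping="INTERNAL",
    person_column=None,
    min_individuals=1,
    categorical=None,
    ordinal=None,
    continuous=None,
    tab_separated=False,
    report_only=False,
    parallel=4,
)

config = parse_config(args)
assert config.db.filename == "pheno.db"
assert "person_id" in config.skip.measures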
Example #3
def test_fake_phenotype_data_ordinal_m4(fake_phenotype_data):
    measure_id = "i1.m4"
    df = fake_phenotype_data.get_measure_values_df(measure_id)
    rank = len(df[measure_id].unique())
    assert rank == 9
    assert len(df) == 195

    measure_conf = default_config()
    classifier = MeasureClassifier(measure_conf)
    report = classifier.meta_measures(df[measure_id])
    assert classifier.classify(report) == MeasureType.ordinal
Example #4
def parse_phenotype_data_config(args):
    config = default_config()
    config.verbose = args.verbose
    config.instruments.dir = args.instruments

    config.pedigree = args.pedigree

    config.db.filename = args.pheno_db_filename

    dump_config(config)
    check_phenotype_data_config(config)

    return config
Example #5
def test_should_convert_to_numeric_cutoff():
    # Eight numeric values and a single non-numeric one ("a"); with this
    # configuration the measure is classified as categorical.
    values = pd.Series(data=["1", "2", "1", "1", "1", "1", "2", "2", "a"])
    report = MeasureClassifier.meta_measures(values)

    config = default_config()
    config.classification.min_individuals = 1
    config.classification.ordinal.min_rank = 2

    classifier = MeasureClassifier(config)
    measure_type = classifier.classify(report)
    assert measure_type == MeasureType.categorical

    # Raising the non-numeric cutoff tolerates the rare non-numeric value when
    # converting to numeric, so the same measure now classifies as ordinal.
    config.classification.non_numeric_cutoff = 0.2
    classifier = MeasureClassifier(config)
    measure_type = classifier.classify(report)
    assert measure_type == MeasureType.ordinal
Example #6
def test_fake_background_classify(fake_background_df):
    # Classify every background column except the first one.
    columns = list(fake_background_df.columns)
    for col in columns[1:]:
        series = fake_background_df[col]

        classifier = MeasureClassifier(default_config())
        classifier_report = MeasureClassifier.meta_measures(series)
        measure_type = classifier.classify(classifier_report)

        assert (measure_type == MeasureType.text
                or measure_type == MeasureType.raw
                or measure_type == MeasureType.categorical)

        # Converted values (ignoring None) should all be strings.
        values = classifier.convert_to_string(series.values)
        values = [v for v in values if v is not None]
        assert all(isinstance(v, str) for v in values)
Example #7
def main(argv=None):  # IGNORE:C0111
    """Command line options."""

    if argv is None:
        argv = sys.argv
    else:
        sys.argv.extend(argv)

    program_name = os.path.basename(sys.argv[0])
    program_shortdesc = __import__("__main__").__doc__.split("\n")[1]
    program_license = """%s

USAGE
""" % (program_shortdesc, )

    try:
        defaults = default_config()

        # Setup argument parser
        parser = ArgumentParser(description=program_license)
        # formatter_class=RawDescriptionHelpFormatter
        # formatter_class=ArgumentDefaultsHelpFormatter)
        parser.add_argument("-V",
                            "--verbose",
                            dest="verbose",
                            action="count",
                            help="set verbosity level",
                            default=0)
        parser.add_argument(
            "-i",
            "--instruments",
            dest="instruments",
            help="directory where all instruments are located",
            metavar="path",
        )
        parser.add_argument(
            "-p",
            "--pedigree",
            dest="pedigree",
            help="pedigree file where families descriptions are located",
            metavar="path",
        )
        parser.add_argument(
            "-d",
            "--description",
            help="standardized tsv file that contains measure descriptions",
        )
        parser.add_argument(
            "-o",
            "--output",
            dest="output",
            help="output file",
            metavar="filename",
        )
        parser.add_argument(
            "-C",
            "--continuous",
            type=int,
            dest="continuous",
            default=defaults["classification"]["continuous"]["min_rank"],
            help="minimal count of unique values for a measure to be "
            "classified as continuous (default: %(default)s)",
        )
        parser.add_argument(
            "-O",
            "--ordinal",
            type=int,
            dest="ordinal",
            default=defaults["classification"]["ordinal"]["min_rank"],
            help="minimal count of unique values for a measure to be "
            "classified as ordinal (default: %(default)s)",
        )

        parser.add_argument(
            "-A",
            "--categorical",
            type=int,
            dest="categorical",
            default=defaults["classification"]["categorical"]["min_rank"],
            help="minimal count of unique values for a measure to be "
            "classified as categorical (default: %(default)s)",
        )

        parser.add_argument(
            "-I",
            "--min-individuals",
            type=int,
            dest="min_individuals",
            default=defaults["classification"]["min_individuals"],
            help="minimal number of individuals for a measure to be "
            "considered for classification (default: %(default)s)",
        )

        parser.add_argument(
            "-S",
            "--skip-columns",
            type=str,
            dest="skip_columns",
            help="comma separated list of instruments columns to skip",
        )

        parser.add_argument(
            "--skip-file",
            type=str,
            dest="skip_file",
            help="file with list of instruments columns to skip",
        )

        parser.add_argument(
            "--composite-fids",
            action="store_true",
            dest="composite_fids",
            help="builds composite family IDs from parents' IDs"
            " (default: %(default)s)",
        )

        parser.add_argument(
            "-r",
            "--role",
            dest="role",
            default=defaults["person"]["role"]["type"],
            help='sets role handling; available choices: "column", "guess"'
            " (default: %(default)s)",
        )

        parser.add_argument(
            "--role-mapping",
            dest="role_mapping",
            default=defaults["person"]["role"]["mapping"],
            help="sets role column mapping rules; "
            'available choices "SPARK", "SSC", "INTERNAL"'
            " (default: %(default)s)",
        )

        parser.add_argument(
            "-P",
            "--person-column",
            dest="person_column",
            # default=defaults['person']['role']['column'],
            help="sets name of a column in instrument's files, "
            "containing personId (default: %(default)s)",
        )

        parser.add_argument(
            "-T",
            "--tab-separated",
            dest="tab_separated",
            action="store_true",
            help="instruments file are tab separated"
            " (default: %(default)s)",
        )

        parser.add_argument(
            "--report-only",
            dest="report_only",
            action="store_true",
            help="runs the tool in report only mode (default: %(default)s)",
        )

        parser.add_argument(
            "--parallel",
            type=int,
            dest="parallel",
            default=defaults["parallel"],
            help="size of executors pool to use for processing"
            " (default: %(default)s)",
        )

        # Process arguments
        args = parser.parse_args()

        if args.verbose == 1:
            logging.basicConfig(level=logging.WARNING)
        elif args.verbose == 2:
            logging.basicConfig(level=logging.INFO)
        elif args.verbose >= 3:
            logging.basicConfig(level=logging.DEBUG)
        else:
            logging.basicConfig(level=logging.ERROR)

        if not args.output and not args.report_only:
            raise CLIError("output filename should be specified")

        if not args.output:
            args.output = "output.db"

        if not args.pedigree:
            raise CLIError("pedigree file must be specified")
        if not args.instruments:
            raise CLIError("instruments directory should be specified")

        config = parse_config(args)
        dump_config(config)

        if not check_phenotype_data_config(config):
            raise Exception("bad classification boundaries")

        if os.path.exists(args.output):
            raise CLIError("output file already exists")

        prep = PrepareVariables(config)
        prep.build_pedigree(args.pedigree)
        prep.build_variables(args.instruments, args.description)

        return 0
    except KeyboardInterrupt:
        return 1
    except Exception as e:
        traceback.print_exc()

        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        sys.stderr.write(indent + "  for help use --help\n")
        return 2
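
main returns conventional exit codes: 0 on success, 1 on KeyboardInterrupt and 2 on any other error. A module built around it would therefore typically finish with a standard entry-point guard:

import sys

if __name__ == "__main__":
    # Propagate main()'s return code (0, 1 or 2) as the process exit status.
    sys.exit(main())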
Example #8
def test_config(temp_dbfile):
    config = default_config()
    config.db.filename = temp_dbfile
    return Box(config.to_dict())
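
Box here presumably comes from the python-box package (an assumption based on the name). Wrapping config.to_dict() in a fresh Box gives an attribute-accessible copy of the default configuration, so, assuming to_dict() returns a plain nested dict, changes made to the returned object stay independent of the original config:

from box import Box

config = default_config()
snapshot = Box(config.to_dict())

# Editing the snapshot should leave the original config untouched.
snapshot.db.filename = "other.db"
print(config.db.filename, snapshot.db.filename)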