Example #1
0
def test_fi1(fi1_df, measure, expected_type):
    """Each parametrized fi1 measure must classify to its expected type."""
    series = fi1_df[measure]
    classifier = MeasureClassifier(default_config())
    report = MeasureClassifier.meta_measures(series)

    assert classifier.classify(report) == expected_type
def test_classifier_all_bool():
    """Booleans count as non-numeric values; conversion yields float64."""
    series = pd.Series(data=[True, False, True])
    report = MeasureClassifier.meta_measures(series)
    assert report.count_with_values == 3
    assert report.count_without_values == 0
    assert report.count_with_numeric_values == 0
    assert report.count_with_non_numeric_values == 3

    converted = MeasureClassifier.convert_to_numeric(series)
    assert converted.dtype == np.float64
def test_classifier_all_float_again():
    """An all-numeric series counts every value as numeric and converts."""
    values = pd.Series(data=[3.3, 1, 2.2, 3.3, 1, 1])
    res = MeasureClassifier.meta_measures(values)
    assert res.count_with_values == 6
    assert res.count_without_values == 0
    assert res.count_with_numeric_values == 6
    assert res.count_with_non_numeric_values == 0

    res = MeasureClassifier.convert_to_numeric(values)
    # FIX: the test only printed the conversion result; assert the dtype
    # instead (mirrors test_classifier_all_bool).
    assert res.dtype == np.float64
Example #4
0
def test_fake_phenotype_data_ordinal_m4(fake_phenotype_data):
    """Measure i1.m4 (9 distinct values over 195 rows) is ordinal."""
    measure_id = "i1.m4"
    values_df = fake_phenotype_data.get_measure_values_df(measure_id)
    assert len(values_df[measure_id].unique()) == 9
    assert len(values_df) == 195

    classifier = MeasureClassifier(default_config())
    report = classifier.meta_measures(values_df[measure_id])
    assert classifier.classify(report) == MeasureType.ordinal
def test_classifier_bool_and_nan():
    """NaN/None/blank are counted as missing; booleans stay non-numeric."""
    series = pd.Series(data=[True, False, True, np.nan, None, " "])
    report = MeasureClassifier.meta_measures(series)
    print(report)
    assert report.count_with_values == 3
    assert report.count_without_values == 3
    assert report.count_with_numeric_values == 0
    assert report.count_with_non_numeric_values == 3

    converted = MeasureClassifier.convert_to_numeric(series)
    print(converted)
    print(converted.dtype)
def test_classifier_float():
    """Mixed missing markers and numbers: only the numbers count as values."""
    series = pd.Series(data=[" ", None, np.nan, 1, 2.2])
    report = MeasureClassifier.meta_measures(series)
    print(report)

    assert report.count_with_values == 2
    assert report.count_without_values == 3
    assert report.count_with_numeric_values == 2
    assert report.count_with_non_numeric_values == 0

    converted = MeasureClassifier.convert_to_numeric(series)
    print(converted)
    print(converted.dtype)
def test_classifier_non_numeric():
    """Numeric-looking strings count as numeric; 'a' is the one exception."""
    series = pd.Series(data=["1", "2", "3", "4.4", "a"])

    report = MeasureClassifier.meta_measures(series)
    print(report)

    assert report.count_with_values == 5
    assert report.count_without_values == 0
    assert report.count_with_numeric_values == 4
    assert report.count_with_non_numeric_values == 1

    converted = MeasureClassifier.convert_to_numeric(series)
    print(converted)
    print(converted.dtype)
Example #8
0
 def run(self):
     """Classify this task's measure; failures are logged, never raised.

     Returns self so the pool caller can retrieve the populated task.
     """
     try:
         # BUG FIX: the message was missing the f-prefix, so the literal
         # text "{self.measure.measure_id}" was logged instead of the id.
         # Also use the module logger (as the except branch does) instead
         # of the root logger.
         logger.info(f"classifying measure {self.measure.measure_id}")
         values = self.mdf["value"]
         classifier = MeasureClassifier(self.config)
         self.classifier_report = classifier.meta_measures(values)
         self.measure.individuals = self.classifier_report.count_with_values
         self.measure.measure_type = classifier.classify(
             self.classifier_report)
         self.build_meta_measure()
     except Exception:
         logger.exception(
             f"problem processing measure: {self.measure.measure_id}")
     return self
def test_fi1(fi1_df):
    """Report counts must be internally consistent for every fi1 column."""
    for column in fi1_df:
        report = MeasureClassifier.meta_measures(fi1_df[column])
        numeric = report.count_with_numeric_values
        non_numeric = report.count_with_non_numeric_values
        assert report.count_with_values == numeric + non_numeric
        assert report.count_total == (
            report.count_with_values + report.count_without_values)
Example #10
0
def test_fake_background_classify(fake_background_df):
    """Background columns classify as text/raw/categorical and stringify."""
    for column in list(fake_background_df.columns)[1:]:
        series = fake_background_df[column]

        classifier = MeasureClassifier(default_config())
        report = MeasureClassifier.meta_measures(series)
        measure_type = classifier.classify(report)

        assert measure_type in (
            MeasureType.text, MeasureType.raw, MeasureType.categorical)

        converted = classifier.convert_to_string(series.values)
        non_null = [value for value in converted if value is not None]
        assert all(isinstance(value, str) for value in non_null)
def test_clasify_minus_values():
    """Dashes are non-numeric values; NaN/None/blank count as missing."""
    series = pd.Series(data=["-", "-", "-", np.nan, None, " ", "-"])
    print(series)

    report = MeasureClassifier.meta_measures(series)
    assert report.count_with_numeric_values == 0
    assert report.count_without_values == 3
    assert report.count_with_non_numeric_values == 4
def test_should_convert_to_numeric_cutoff():
    """Raising the non-numeric cutoff flips categorical to ordinal."""
    series = pd.Series(data=["1", "2", "1", "1", "1", "1", "2", "2", "a"])
    report = MeasureClassifier.meta_measures(series)

    config = default_config()
    config.classification.min_individuals = 1
    config.classification.ordinal.min_rank = 2

    assert MeasureClassifier(config).classify(report) == \
        MeasureType.categorical

    # one non-numeric out of nine values falls below a 0.2 cutoff
    config.classification.non_numeric_cutoff = 0.2
    assert MeasureClassifier(config).classify(report) == MeasureType.ordinal
Example #13
0
    def __init__(self, config):
        # NOTE(review): super(PreparePersons, self) targets the class above
        # PreparePersons in the MRO; if this method belongs to PreparePersons
        # itself this equals super() -- the class header is not visible here,
        # so confirm before simplifying.
        super(PreparePersons, self).__init__(config)
        self.sample_ids = None  # filled in later, before use
        self.classifier = MeasureClassifier(config)

        # worker pool used for parallel measure classification/saving
        self.pool = Pool(processes=self.config.parallel)
Example #14
0
class PrepareVariables(PreparePersons):
    """Loads instrument files, classifies their measures and stores values."""

    def __init__(self, config):
        # NOTE(review): super(PreparePersons, self) deliberately bypasses
        # PreparePersons.__init__ and calls its base class instead; the
        # attributes below duplicate what PreparePersons.__init__ would set.
        # Confirm the skip is intentional before changing to super().
        super(PreparePersons, self).__init__(config)
        self.sample_ids = None  # filled in later, before use
        self.classifier = MeasureClassifier(config)

        # worker pool used for parallel measure classification/saving
        self.pool = Pool(processes=self.config.parallel)

    def _get_person_column_name(self, df):
        """Return the person-id column: configured name or the first column."""
        configured = self.config.person.column
        person_id = configured if configured else df.columns[0]
        logger.debug(f"Person ID: {person_id}")
        return person_id

    def load_instrument(self, instrument_name, filenames):
        """Read all CSV files of one instrument into a single dataframe.

        Renames each file's person-id column to PERSON_ID, concatenates
        the per-file frames, drops rows for unknown persons and strips
        the instrument name from measure columns.

        Returns the prepared dataframe (possibly empty).
        """
        assert filenames
        assert all([os.path.exists(f) for f in filenames])

        sep = "\t" if self.config.instruments.tab_separated else ","

        dataframes = []
        for filename in filenames:
            # BUG FIX: the original message was a placeholder-less f-string
            # that logged the literal text "(unknown)" for every file.
            logger.info(f"reading instrument: {filename}")
            df = pd.read_csv(filename,
                             sep=sep,
                             low_memory=False,
                             encoding="ISO-8859-1")
            person_id = self._get_person_column_name(df)
            # use the module logger consistently (was the root logger)
            logger.info(
                f"renaming column '{person_id}' to '{self.PERSON_ID}' "
                f"in instrument: {instrument_name}")

            df = df.rename(columns={person_id: self.PERSON_ID})
            dataframes.append(df)
        assert len(dataframes) >= 1

        if len(dataframes) == 1:
            df = dataframes[0]
        else:
            # all files of a single instrument must share the column count
            assert len(set([len(f.columns) for f in dataframes])) == 1
            df = pd.concat(dataframes, ignore_index=True)

        assert df is not None
        if len(df) == 0:
            return df

        df = self._augment_person_ids(df)
        df = self._adjust_instrument_measure_names(instrument_name, df)
        return df

    def _adjust_instrument_measure_names(self, instrument_name, df):
        """Strip the instrument name from measure columns (first column kept)."""
        if len(df) == 0:
            return df

        renames = {}
        for name in df.columns[1:]:
            pieces = [piece.strip() for piece in name.split(".")]
            kept = [piece for piece in pieces if piece != instrument_name]
            renames[name] = ".".join(kept)
        df.rename(columns=renames, inplace=True)
        return df

    @property
    def log_filename(self):
        """Path of the classification report log (TSV)."""
        db_filename = self.config.db.filename
        if not self.config.report_only:
            base, _ext = os.path.splitext(db_filename)
            return base + "_report_log.tsv"
        # report-only mode: the log path is given directly and the DB
        # is expected to be in-memory
        filename = self.config.report_only
        assert db_filename == "memory"
        return filename

    def log_header(self):
        """Start a fresh report log containing only the header line."""
        with open(self.log_filename, "w") as log_file:
            log_file.write(ClassifierReport.header_line() + "\n")

    def log_measure(self, measure, classifier_report):
        """Append one measure's classification report line to the log file."""
        classifier_report.set_measure(measure)
        logging.info(classifier_report.log_line(short=True))

        with open(self.log_filename, "a") as log_file:
            log_file.write(classifier_report.log_line() + "\n")

    def save_measure(self, measure):
        """Insert the measure row and return its generated primary key."""
        record = measure.to_dict()
        assert "db_id" not in record, record
        statement = self.db.measure.insert().values(**record)
        with self.db.engine.begin() as connection:
            inserted = connection.execute(statement)
            new_id = inserted.inserted_primary_key[0]

        return new_id

    def save_measure_values(self, measure, values):
        """Bulk-insert one measure's values into its type's value table.

        Measures without values are skipped with a warning.
        """
        if len(values) == 0:
            # typo fix in the log message: "skiping" -> "skipping"
            logging.warning(
                f"skipping measure {measure.measure_id} without values")
            return
        logging.info(
            f"saving measure {measure.measure_id} values {len(values)}")
        value_table = self.db.get_value_table(measure.measure_type)
        ins = value_table.insert()

        with self.db.engine.begin() as connection:
            connection.execute(ins, list(values.values()))

    def _collect_instruments(self, dirname):
        """Walk dirname and group CSV-like files by instrument name."""
        pattern = re.compile("(?P<instrument>.*)(?P<ext>\\.csv.*)")
        instruments = defaultdict(list)
        for root, _dirnames, filenames in os.walk(dirname):
            for filename in filenames:
                basename = os.path.basename(filename).lower()
                match = pattern.match(basename)
                if match is None:
                    logger.debug(f"filename {basename} is not an instrument; "
                                 f"skipping...")
                    continue
                logger.debug(f"instrument matched: {match.group('instrument')}; "
                             f"file extension: {match.group('ext')}")
                instruments[match.group("instrument")].append(
                    os.path.abspath(os.path.join(root, filename)))
        return instruments

    def build_variables(self, instruments_dirname, description_path):
        """Classify and store every measure of every discovered instrument."""
        self.log_header()

        self.build_pheno_common()

        instruments = self._collect_instruments(instruments_dirname)
        descriptions = PrepareVariables.load_descriptions(description_path)
        for name, instrument_files in list(instruments.items()):
            assert name is not None
            instrument_df = self.load_instrument(name, instrument_files)
            if len(instrument_df) == 0:
                logger.info(f"instrument {name} is empty; skipping")
                continue
            self.build_instrument(name, instrument_df, descriptions)

    def _augment_person_ids(self, df):
        """Map PERSON_ID values to internal person ids in a new PID column.

        Rows whose person id is not known are logged and dropped from the
        returned dataframe.
        """
        persons = self.get_persons()
        # NOTE(review): pd.Series(df.index) has a default 0..n-1 index while
        # it is written below using df's index labels -- this assumes df
        # carries a default RangeIndex; confirm callers never pass a
        # re-indexed frame.
        pid = pd.Series(df.index)
        for index, row in df.iterrows():
            p = persons.get(row[self.PERSON_ID])
            if p is None:
                # unknown person: mark the row for removal below
                pid[index] = np.nan
                logging.info(
                    f"measure for missing person: {row[self.PERSON_ID]}")
            else:
                assert p is not None
                assert p.person_id == row[self.PERSON_ID]
                pid[index] = p.id

        df[self.PID_COLUMN] = pid
        if len(df) > 0:
            # keep only rows that matched a known person
            df = df[np.logical_not(np.isnan(df[self.PID_COLUMN]))].copy()
        return df

    def build_pheno_common(self):
        """Build the pheno_common instrument from extra pedigree columns."""
        reserved = set(self.PED_COLUMNS_REQUIRED) | set(["sampleId", "role"])
        pheno_common_measures = set(self.pedigree_df.columns) - reserved

        df = self.pedigree_df.copy(deep=True)
        df.rename(columns={"personId": self.PERSON_ID}, inplace=True)
        assert self.PERSON_ID in df.columns
        df = self._augment_person_ids(df)

        columns = [self.PERSON_ID, self.PID_COLUMN]
        columns.extend(pheno_common_measures)
        self.build_instrument("pheno_common", df[columns])

    def build_instrument(self, instrument_name, df, descriptions=None):
        """Classify every measure column of one instrument and persist it.

        Classification tasks run on the worker pool; measures classified
        as `skipped` are only logged. In report_only mode nothing is saved
        and the method returns None; otherwise the input df is returned.
        """
        assert df is not None
        assert self.PERSON_ID in df.columns

        classify_queue = TaskQueue()
        save_queue = TaskQueue()

        # fan out: one async classification task per measure column
        for measure_name in df.columns:
            if (measure_name == self.PID_COLUMN
                    or measure_name == self.PERSON_ID):
                continue

            if descriptions:
                measure_desc = descriptions(instrument_name, measure_name)
            else:
                measure_desc = None

            classify_task = ClassifyMeasureTask(self.config, instrument_name,
                                                measure_name, measure_desc, df)
            res = self.pool.apply_async(classify_task)
            classify_queue.put(res)
        # fan in: collect results, log each, drop skipped measures
        while not classify_queue.empty():
            res = classify_queue.get()
            task = res.get()
            measure, classifier_report, _mdf = task.done()
            self.log_measure(measure, classifier_report)
            if measure.measure_type == MeasureType.skipped:
                logging.info(
                    f"skip saving measure: {measure.measure_id}; "
                    f"measurings: {classifier_report.count_with_values}")
                continue
            save_queue.put(task)

        if self.config.report_only:
            return

        # save measure rows serially (each needs its generated db id),
        # then fan the larger value inserts back out to the pool
        values_queue = TaskQueue()
        while not save_queue.empty():
            task = save_queue.get()
            measure, classifier_report, mdf = task.done()

            measure_id = self.save_measure(measure)
            measure.db_id = measure_id
            values_task = MeasureValuesTask(measure, mdf)
            res = self.pool.apply_async(values_task)
            values_queue.put(res)

        while not values_queue.empty():
            res = values_queue.get()
            values_task = res.get()
            measure, values = values_task.done()
            self.save_measure_values(measure, values)

        return df

    @staticmethod
    def create_default_measure(instrument_name, measure_name):
        """Return a Box with default metadata for a not-yet-classified measure."""
        return Box({
            "measure_type": MeasureType.other,
            "measure_name": measure_name,
            "instrument_name": instrument_name,
            "measure_id": "{}.{}".format(instrument_name, measure_name),
            "individuals": None,
            "default_filter": None,
        })

    def classify_measure(self, instrument_name, measure_name, df):
        """Classify one measure's values; return (report, measure)."""
        measure = self.create_default_measure(instrument_name, measure_name)

        report = self.classifier.meta_measures(df["value"])
        measure.individuals = report.count_with_values
        measure.measure_type = self.classifier.classify(report)

        return report, measure

    @staticmethod
    def load_descriptions(description_path):
        """Return a callable (instrument, measure) -> description, or None.

        The description file is a TSV with instrumentName, measureName,
        measureId and description columns.
        """
        if not description_path:
            return None
        assert os.path.exists(
            os.path.abspath(description_path)), description_path

        desc_df = pd.read_csv(description_path, sep="\t")

        class DescriptionDf:
            REQUIRED_COLUMNS = [
                "instrumentName",
                "measureName",
                "measureId",
                "description",
            ]

            def __init__(self, desc_df):
                self.desc_df = desc_df
                assert all([
                    col in list(desc_df) for col in self.REQUIRED_COLUMNS
                ]), list(desc_df)

            def __call__(self, iname, mname):
                measure_id = "{}.{}".format(iname, mname)
                if measure_id not in self.desc_df["measureId"].values:
                    return None
                row = self.desc_df.query(("(instrumentName == @iname) and "
                                          "(measureName == @mname)"))
                return row.iloc[0]["description"]

        return DescriptionDf(desc_df)