# The snippets below are methods taken from a FeatureExtractor class that is not
# shown here; the imports they rely on are collected once at the top. The idiva
# import paths are assumptions and may need to be adapted to the package layout.
import logging
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Perceptron
from tqdm import tqdm

from idiva.fextr import align
from idiva.io import ReadVCF  # assumed module path for the VCF reader

log = logging.getLogger(__name__)


def get_reduced_dataframe_from_saved_classifier() -> pd.DataFrame:
        """
        Returns reduced dataframe given that a classifier is stored as classifier.sav
        """
        clf = FeatureExtractor.get_saved_classifier()

        # SelectFromModel with prefit=True reuses the coefficients of the already
        # fitted classifier to decide which variants (features) to keep.
        selector = SelectFromModel(clf, prefit=True)

        cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
        assert cache.is_dir()

        from idiva.fextr import align

        with open(str(cache) + "/control_v2.vcf") as ctrl_vcf:
            ctrl_reader = ReadVCF(ctrl_vcf)
            with open(str(cache) + "/case_processed_v2.vcf") as case_vcf:
                case_reader = ReadVCF(case_vcf)
                dataframe = align(ctrl=ctrl_reader, case=case_reader)
                id = dataframe.index

        dataframe['ID'] = dataframe.ID_case.combine_first(dataframe.ID_ctrl)

        dataframe = dataframe[['CHROM', 'POS', 'ID', 'REF', 'ALT']]

        extracted = id[selector.get_support()].values

        return dataframe.loc[extracted]
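
# A minimal, self-contained sketch (not part of the original code) of what
# SelectFromModel(clf, prefit=True) does in the function above: it reuses the
# coefficients of an already fitted linear model and keeps only the features
# (here: variants) whose coefficient magnitude clears the default threshold.
# The data is synthetic and only illustrates the scikit-learn API.
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Perceptron

rng = np.random.RandomState(0)
X = rng.randn(200, 20)
y = (X[:, 3] + X[:, 7] > 0).astype(int)                  # only features 3 and 7 are informative

clf_demo = Perceptron().fit(X, y)
selector_demo = SelectFromModel(clf_demo, prefit=True)   # no refit; uses clf_demo.coef_
mask = selector_demo.get_support()                       # boolean mask over the 20 features
X_reduced = X[:, mask]                                   # analogous to dataframe.loc[extracted]
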
    def feature_extraction_chunks(self, ctrl_vcf_file: str, case_vcf_file: str):
        """
        Returns a fitted Perceptron classifier for the given vcf files
        The classifier is trained in chunks where the chunks consist of a range of patient
        Therefore the classifier iterates columnwise over the vcf files
        The files are divided into equally many chunks and therefore the individual chunksize can differ
        """
        log.info("Fit linear classifier and reduce number of variants")

        clf = Perceptron()

        cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
        assert cache.is_dir()

        # create unique index
        id = None

        with open(str(cache) + "/" + ctrl_vcf_file) as ctrl_vcf:
            ctrl_reader = ReadVCF(ctrl_vcf)
            with open(str(cache) + "/" + case_vcf_file) as case_vcf:
                case_reader = ReadVCF(case_vcf)
                dataframe = align(ctrl=ctrl_reader, case=case_reader)
                id = dataframe.index

        with open(str(cache) + "/" + ctrl_vcf_file) as ctrl_vcf:
            with open(str(cache) + "/" + case_vcf_file) as case_vcf:
                reader_ctrl = ReadVCF(ctrl_vcf)
                reader_case = ReadVCF(case_vcf)

                header_ctrl = reader_ctrl.header

                header_case = reader_case.header

                # Drop the fixed VCF fields ID, REF, QUAL, FILTER, INFO and FORMAT
                # (indices 2, 3 and 5-8, assuming the standard VCF column order);
                # keep CHROM, POS, ALT and the per-patient sample columns.
                exclude = [2, 3, 5, 6, 7, 8]

                names_ctrl = [i for idx, i in enumerate(header_ctrl) if idx not in exclude]
                names_case = [i for idx, i in enumerate(header_case) if idx not in exclude]

                # The first nine header fields are the fixed VCF columns, so the
                # remainder is the number of patients (sample columns) in each file.
                len_ctrl = len(header_ctrl) - 9
                len_case = len(header_case) - 9

                min_batch_size = min([len_ctrl, len_case, 50])

                number_of_batches = int(max([np.ceil(len_ctrl / min_batch_size), np.ceil(len_case / min_batch_size)]))

                batch_size_ctrl = int(np.ceil(len_ctrl / number_of_batches))
                batch_size_case = int(np.ceil(len_case / number_of_batches))

                # Start offsets of each patient batch, one list per file.
                batches_ctrl = [i * batch_size_ctrl for i in range(number_of_batches)]
                batches_case = [i * batch_size_case for i in range(number_of_batches)]

                # Append the total lengths as the final boundaries, so that
                # batches[idx]:batches[idx + 1] covers every patient.
                batches_ctrl.append(len_ctrl)
                batches_case.append(len_case)

                for idx in tqdm(range(number_of_batches), total=number_of_batches, postfix='feature selection'):
                    clf = self.feature_extraction_batch(reader_ctrl, reader_case, names_ctrl, names_case,
                                                        batches_ctrl, batches_case, idx, clf, id)

        return clf, id
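
# feature_extraction_batch is not shown in these snippets; a plausible reading is
# that it assembles the feature matrix for the patients of the current batch and
# updates the Perceptron incrementally. The sketch below (not original code)
# illustrates that pattern with scikit-learn's partial_fit on synthetic data,
# using a single-file simplification of the batch-boundary arithmetic above.
import numpy as np
from sklearn.linear_model import Perceptron

n_patients, n_variants = 130, 1000
rng = np.random.RandomState(0)
X_all = rng.randint(0, 3, size=(n_patients, n_variants))   # toy genotype matrix (patients x variants)
y_all = rng.randint(0, 2, size=n_patients)                 # toy case/control labels

batch_size = min(n_patients, 50)
number_of_batches = int(np.ceil(n_patients / batch_size))
boundaries = [i * batch_size for i in range(number_of_batches)] + [n_patients]

clf_demo = Perceptron()
for i in range(number_of_batches):
    lo, hi = boundaries[i], boundaries[i + 1]
    # classes must be given so partial_fit knows the full label set from the first call
    clf_demo.partial_fit(X_all[lo:hi], y_all[lo:hi], classes=np.array([0, 1]))
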
    def get_reduced_dataframe(self) -> pd.DataFrame:
        """
        Returns reduced dataframe
        """
        cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve()
        assert cache.is_dir()

        from idiva.fextr import align

        with open(str(cache) + "/control_v2.vcf") as ctrl_vcf:
            ctrl_reader = ReadVCF(ctrl_vcf)
            with open(str(cache) + "/case_processed_v2.vcf") as case_vcf:
                case_reader = ReadVCF(case_vcf)
                dataframe = align(ctrl=ctrl_reader, case=case_reader)

        dataframe['ID'] = dataframe.ID_case.combine_first(dataframe.ID_ctrl)

        dataframe = dataframe[['CHROM', 'POS', 'ID', 'REF', 'ALT']]

        extracted = self.get_extracted_variants().values

        return dataframe.loc[extracted]
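
# A small pandas illustration (made-up data) of the combine_first step above:
# the variant ID is taken from the case file where available and falls back to
# the control file otherwise.
import pandas as pd

demo = pd.DataFrame({
    "ID_case": ["rs1", None, None],
    "ID_ctrl": ["rs1", "rs2", None],
})
demo["ID"] = demo.ID_case.combine_first(demo.ID_ctrl)   # -> ["rs1", "rs2", None]
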
Example #4
    def get_reduced_dataframe(self, *, case_vcf, ctrl_vcf) -> pd.DataFrame:
        """
        Returns reduced dataframe
        """
        cache = (Path(__file__).parent.parent.parent.parent /
                 "input/download_cache").resolve()
        assert cache.is_dir()

        from idiva.fextr import align

        with ctrl_vcf.rewind_when_done:
            ctrl_reader = ctrl_vcf
            with case_vcf.rewind_when_done:
                case_reader = case_vcf
                dataframe = align(ctrl=ctrl_reader, case=case_reader)
                id = dataframe.index

        dataframe['ID'] = dataframe.ID_case.combine_first(dataframe.ID_ctrl)

        dataframe = dataframe[['CHROM', 'POS', 'ID', 'REF', 'ALT']]

        extracted = self.get_extracted_variants().values

        return dataframe.loc[extracted]
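
# The final step of both get_reduced_dataframe variants selects rows by index
# label. A tiny pandas illustration with made-up labels:
import pandas as pd

frame = pd.DataFrame({"CHROM": ["1", "1", "2"], "POS": [100, 200, 300]},
                     index=[10, 11, 12])
extracted_demo = pd.Index([12, 10])
print(frame.loc[extracted_demo])   # rows labelled 12 and 10, in that order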