Beispiel #1
0
 def __init__(self, method='', dir_name='temp', is_clean=True):
     """Prepare the working directory, the default classifier and the method.

     ``method`` is stored as ``self.method`` when it is one of 'EM', 'EM1',
     'Iter', 'Iter1', 'PCC', 'CC', 'ACC', 'PACC' or 'test'; the empty string
     selects 'CC'.  Any other value leaves ``self.method`` unset.
     ``dir_name`` is the folder name below ``texts/``; when ``is_clean`` is
     true that folder is passed to ``__clean_dir`` first.
     """
     self.prefix = 'texts/'
     self.arff = Parse_ARFF()
     self.dir_name = dir_name
     if is_clean:
         self.__clean_dir(self.prefix + self.dir_name)
     self.n_folds = 5
     self.classes = [0, 1]
     # Prevalence estimator in use: ._bin_prevalence or ._multi_prevalence
     self.method_prev = self._bin_prevalence
     self.model = self.__classificator(class_weight='auto')
     if method == '':
         self.method = 'CC'
     elif method in ('EM', 'EM1', 'Iter', 'Iter1', 'PCC', 'CC', 'ACC', 'PACC', 'test'):
         self.method = method
         if method == 'test':
             # "test" mode also asks the ARFF helper for the pickled file names.
             self._train_file, self._test_files = self.arff.read_dir(
                 self.prefix + 'pickle_' + dir_name)
 def __init__(self, method="", dir_name="temp", is_clean=True):
     """Initialise paths, the default estimator and the quantification method.

     Recognised ``method`` values are "CC", "ACC", "PCC", "PACC", "EM",
     "EM1", "Iter", "Iter1" and "test"; "" falls back to "CC".  For an
     unrecognised value ``self.method`` is not assigned at all.
     """
     simple_methods = ("EM", "EM1", "Iter", "Iter1", "PCC", "CC", "ACC", "PACC")

     self.prefix = "texts/"
     self.arff = Parse_ARFF()
     self.dir_name = dir_name
     if is_clean:
         # Start from an empty working folder.
         self.__clean_dir(self.prefix + self.dir_name)
     self.n_folds = 5
     self.classes = [0, 1]
     self.method_prev = self._bin_prevalence  # ._bin_prevalence or ._multi_prevalence
     self.model = self.__classificator(class_weight="auto")

     if method in simple_methods:
         self.method = method
     elif method == "test":
         self.method = method
         pickle_dir = self.prefix + "pickle_" + dir_name
         self._train_file, self._test_files = self.arff.read_dir(pickle_dir)
     elif method == "":
         self.method = "CC"
class Quantification:
    def __classificator(self, class_weight="auto"):
        """Create a new, unfitted logistic-regression estimator.

        An empty ``class_weight`` builds the estimator without any keyword
        arguments; every other value is forwarded as ``class_weight``.
        """
        # Earlier experiments used SVC(kernel='rbf', probability=True) and
        # GMM(n_components=2) in place of logistic regression.
        options = {} if class_weight == "" else {"class_weight": class_weight}
        return linear_model.LogisticRegression(**options)

    def __init__(self, method="", dir_name="temp", is_clean=True):
        """Configure the quantifier.

        Parameters
        ----------
        method : str
            One of "CC", "ACC", "PCC", "PACC", "EM", "EM1", "Iter", "Iter1"
            or "test".  The empty string selects "CC"; any other value
            leaves ``self.method`` unassigned.
        dir_name : str
            Folder name below ``texts/`` used for intermediate files.
        is_clean : bool
            When true, ``__clean_dir`` is run on ``texts/<dir_name>``.
        """
        self.prefix = "texts/"
        self.arff = Parse_ARFF()
        self.dir_name = dir_name
        if is_clean:
            self.__clean_dir(self.prefix + self.dir_name)

        self.n_folds = 5
        self.classes = [0, 1]
        # Either ._bin_prevalence or ._multi_prevalence can be plugged in here.
        self.method_prev = self._bin_prevalence
        self.model = self.__classificator(class_weight="auto")

        supported = ("EM", "EM1", "Iter", "Iter1", "PCC", "CC", "ACC", "PACC", "test")
        if method in supported:
            self.method = method
            if method == "test":
                self._train_file, self._test_files = self.arff.read_dir(
                    self.prefix + "pickle_" + dir_name
                )
        elif method == "":
            self.method = "CC"

    def fit(self, X, y):
        if isinstance(y, list):
            y = np.asarray(y)
        self.classes = np.unique(y)
        # if isinstance(y, csr_matrix):
        #    self.y_train=y.toarray()
        # elif isinstance(y, np.ndarray):
        #    if len(y.shape)==1:
        #        self.y_train=MultiLabelBinarizer(classes=self.classes).fit_transform([[y_p] for y_p in y])
        #    elif len(y.shape)==2:
        #        self.y_train=y
        self.y_train = y
        self.X_train = X
        self.model.fit(X, y)
        return self.model

    def predict(self, X, method=""):
        if method != "":
            self.method = method
        if self.method == "CC":
            y_pred = self.model.predict(X)
            # print('CC', y_pred)
            prevalence = self._classify_and_count(y_pred)
        elif self.method == "ACC":
            y_pred = self.model.predict(X)
            self.kfold_results = self.__kfold_tp_fp(self.X_train, self.y_train, n_folds=self.n_folds)
            prevalence = self._adj_classify_and_count(y_pred, is_prob=False)
        elif self.method == "PCC":
            prob_pred = self.model.predict_proba(X)
            prevalence = self._prob_classify_and_count(prob_pred)
        elif self.method == "PACC":
            self.kfold_results = self.__kfold_prob_tp_fp(self.X_train, self.y_train, n_folds=self.n_folds)
            prob_pred = self.model.predict_proba(X)
            prevalence = self._adj_classify_and_count(prob_pred, is_prob=True)
        elif self.method == "EM":
            prob_pred = self.model.predict_proba(X)
            prevalence = self._expectation_maximization(self.y_train, prob_pred, stop_delta=0.00001)
        elif self.method == "EM1":
            prob_pred = self.model.predict_proba(X)
            prevalence = self._exp_max(self.y_train, prob_pred, stop_delta=0.00001)
        elif self.method == "Iter":
            prevalence = self._cost_sens_learning(X, stop_delta=0.00001, class_weight_start="auto")
        elif self.method == "Iter1":
            prevalence = self._cost_sens_learning(X, stop_delta=0.00001, class_weight_start="")
        elif self.method == "test":
            self._process_pipeline()
        return prevalence

    def predict_set(self, X_list, method=""):
        scores = []
        for X in X_list:
            prev_pred = self.predict(X, method)
            scores.append(prev_pred)
        return scores

    def score(self, X_list, y_list, method=""):
        scores = []
        for X, y in zip(X_list, y_list):
            y = np.asarray(y)
            prev_pred = self.predict(X, method)
            prev_true = self._classify_and_count(y)
            # print(prev_pred, prev_true)
            # scores.append(self._divergence_bin(prev_true, prev_pred, self._kld))
            scores.append(self._emd(prev_true, prev_pred))
        return np.average(scores)

    def make_drift_rnd(X, y, proportion=0.5):
        """Create a drifted copy of ``(X, y)`` by thinning randomly chosen classes.

        Up to ``len(classes) // 2`` classes are picked at random (at least
        one whenever there are two or more classes).  Each row of a picked
        class is kept only when a uniform random draw exceeds ``proportion``;
        rows of the other classes are all kept.

        Parameters
        ----------
        X, y : array-like supporting integer-list indexing (``X[[...]]``).
        proportion : float
            Fraction of each picked class that is dropped on average.

        Returns
        -------
        (new_X, new_y) with rows grouped by class.  Two empty lists are
        returned when ``y`` is empty.

        Note: the function takes no ``self`` and is meant to be called
        through the class, e.g. ``Quantification.make_drift_rnd(X, y)``.
        """
        # Row positions of every class label.
        index = {label: [] for label in np.unique(y)}
        for pos, label in enumerate(y):
            index[label].append(pos)
        if not index:
            return [], []

        # Pick the classes to thin: walk the labels, taking each with
        # probability 0.5, and repeat the walk until at least one was taken.
        ind2low = []
        num2low = len(index) // 2
        while not ind2low and num2low != 0:
            for label in index:
                if len(ind2low) >= num2low:
                    break
                if random.random() < 0.5:
                    ind2low.append(label)

        # Every thinned class gets its own list of surviving rows; sharing
        # one list between classes would duplicate rows in the output.
        new_ind = dict(index)
        for label in ind2low:
            new_ind[label] = [pos for pos in index[label] if random.random() > proportion]

        new_X = np.concatenate([X[new_ind[label]] for label in index], axis=0)
        new_y = np.concatenate([y[new_ind[label]] for label in index])
        return new_X, new_y

    def make_drift_05(X, y, proportion=0.5):
        """Thin one side of an indicator-encoded data set to create drift.

        ``y`` is a 2-D indicator matrix (one column per class; a row belongs
        to a class when its entry exceeds 0.5).  For ``proportion < 0.5``
        class 0 is thinned and rows of it are kept with probability
        ``2 * proportion``; otherwise classes ``1..k-1`` are thinned with
        keep probability ``2 * (1 - proportion)``.  All other rows are kept.

        Every input row appears at most once in the result, in input order.
        Rows are taken with ordinary indexing, so the dtypes of ``X`` and
        ``y`` are preserved.

        Returns
        -------
        (new_X, new_y).  When no row survives both are the empty
        ``np.array([], ndmin=2)`` placeholder.

        Note: the function takes no ``self`` and is meant to be called
        through the class, e.g. ``Quantification.make_drift_05(X, y)``.
        """
        if proportion < 0.5:
            ind2low = [0]
            keep_prob = proportion * 2
        else:
            ind2low = list(range(1, y.shape[1]))
            keep_prob = (1 - proportion) * 2

        kept_rows = []
        for row in range(y.shape[0]):
            thinned = any(y[row, cls] > 0.5 for cls in ind2low)
            # A random number is drawn only for rows of a thinned class.
            if not thinned or random.random() < keep_prob:
                kept_rows.append(row)

        if not kept_rows:
            return np.array([], ndmin=2), np.array([], ndmin=2)
        return X[kept_rows], y[kept_rows]

    def make_drift_list(X, y, proportion=0.5):
        """Thin one side of a label-encoded data set to create drift.

        ``y`` is a sequence of class labels.  For ``proportion < 0.5`` rows
        labelled 0 are thinned with keep probability ``2 * proportion``;
        otherwise rows labelled ``1 .. k-1`` (``k`` = number of distinct
        labels) are thinned with keep probability ``2 * (1 - proportion)``.
        All other rows are kept.

        Every input row appears at most once in the result, in input order.

        Returns
        -------
        (new_X, new_y) where ``new_y`` is a list of the surviving labels and
        ``new_X`` the matching rows of ``X``.  When nothing survives
        ``new_X`` is the empty ``np.array([], ndmin=2)`` placeholder.

        Note: the function takes no ``self`` and is meant to be called
        through the class, e.g. ``Quantification.make_drift_list(X, y)``.
        """
        n_classes = len(np.unique(y))
        if proportion < 0.5:
            ind2low = {0}
            keep_prob = proportion * 2
        else:
            ind2low = set(range(1, n_classes))
            keep_prob = (1 - proportion) * 2

        kept_rows = []
        new_y = []
        for row, label in enumerate(y):
            # A random number is drawn only for rows of a thinned class.
            if label not in ind2low or random.random() < keep_prob:
                kept_rows.append(row)
                new_y.append(label)

        if not kept_rows:
            return np.array([], ndmin=2), new_y
        return X[kept_rows], new_y

    def _kld(self, p, q):
        """Kullback-Leibler divergence D(P || Q) for discrete distributions when Q is used to approximate P
        Parameters
        p, q : array-like, dtype=float, shape=n
        Discrete probability distributions."""
        p = np.asarray(p, dtype=np.float)
        q = np.asarray(q, dtype=np.float)
        return np.sum(np.where(p != 0, p * np.log(p / q), 0))

    def _rae(self, p, q):
        p = np.asarray(p, dtype=np.float)
        q = np.asarray(q, dtype=np.float)
        return np.sum(np.where(p != 0, np.abs(q - p) / p, 0))

    def _ae(self, p, q):
        # Absolute error
        p = np.asarray(p, dtype=np.float)
        q = np.asarray(q, dtype=np.float)
        return np.average(np.abs(q - p))

    def _emd(self, p, q):
        # Earth Mover’s Distance (Rubner et al., 2000)
        p = np.asarray(p, dtype=np.float)
        q = np.asarray(q, dtype=np.float)
        emd = 0
        for i in range(1, len(p)):
            # emd+=np.abs(np.sum(q[0:i])-np.sum(p[0:i]))
            emd += np.sum(np.abs(q[0:i] - p[0:i]))
        return emd

    def _divergence_bin(self, p, q, func=""):
        if func == "":
            func = self._kld
        p = np.asarray(p, dtype=np.float)
        q = np.asarray(q, dtype=np.float)
        # print(p,q)
        klds = []
        for p_i, q_i in zip(p, q):
            klds.append(func([p_i, 1 - p_i], [q_i, 1 - q_i]))
        # print(len(_klds))
        # _avg=np.average(_klds)
        return klds  # _avg

    def _multi_prevalence(self, y):
        prevalence = []
        prevalence_smooth = []
        eps = 1 / (2 * y.shape[0])
        if isinstance(y, csr_matrix):
            for _col in range(y.shape[1]):
                prevalence.append(y.getcol(_col).nnz)
            prevalence = prevalence / (np.sum(prevalence))
            for _val in prevalence:  # perform smoothing
                prevalence_smooth.append(_val + eps)
            prevalence_smooth = prevalence_smooth / (np.sum(prevalence) + eps * y.shape[1])
        elif isinstance(y, np.ndarray):
            if len(y.shape) == 1:
                yt = MultiLabelBinarizer(classes=self.classes).fit_transform([[y_p] for y_p in y]).transpose()
                for col in yt:
                    prevalence.append(np.sum(col))
                prevalence = prevalence / (np.sum(prevalence))
                for val in prevalence:  # perform smoothing
                    prevalence_smooth.append(val + eps)
                prevalence_smooth = prevalence_smooth / (np.sum(prevalence) + eps * yt.shape[0])
            elif len(y.shape) == 2:
                yt = y.transpose()
                for col in yt:
                    prevalence.append(np.sum(col))
                prevalence = prevalence / (np.sum(prevalence))
                for val in prevalence:  # perform smoothing
                    prevalence_smooth.append(val + eps)
                prevalence_smooth = prevalence_smooth / (np.sum(prevalence) + eps * yt.shape[0])
        return prevalence_smooth

    def _bin_prevalence(self, y):
        prevalence = []
        if isinstance(y, csr_matrix):
            eps = 1 / (2 * y.shape[0])
            for col in range(y.shape[1]):
                prevalence.append((y.getcol(col).nnz + eps) / (eps * y.shape[1] + y.shape[0]))
            prevalence = np.asarray(prevalence, dtype=np.float)
        elif isinstance(y, list):
            eps = 1 / (2 * len(y))
            yt = MultiLabelBinarizer(classes=self.classes).fit_transform([[y_p] for y_p in y]).transpose()
            for col in range(yt.shape[0]):
                prevalence.append((np.sum(yt[col]) + eps) / (eps * yt.shape[0] + yt.shape[1]))
            prevalence = np.asarray(prevalence, dtype=np.float)
        elif isinstance(y, np.ndarray):
            eps = 1 / (2 * y.shape[0])
            if len(y.shape) == 1:
                # print(self.classes, 'Variable "y" should have more then 1 dimension. Use MultiLabelBinarizer()')
                yt = MultiLabelBinarizer(classes=self.classes).fit_transform([[y_p] for y_p in y]).transpose()
            elif len(y.shape) == 2:
                yt = y.transpose()
            for col in range(yt.shape[0]):
                prevalence.append((np.sum(yt[col]) + eps) / (eps * yt.shape[0] + yt.shape[1]))
            prevalence = np.asarray(prevalence, dtype=np.float)
        return prevalence

    def _bin_prevalence_prob(self, y):
        y = np.asarray(y, dtype=np.float).T
        eps = 1 / (2 * y.shape[1])
        prevalence = []
        print(self.model.intercept_[0][0], self.model.coef_)
        for col in y:
            nnz = 0
            for elem in col:
                if elem >= self.model.intercept_:
                    nnz += 1
            prevalence.append((nnz + eps) / (eps * y.shape[0] + y.shape[1]))
        return prevalence

    def __clean_dir(self, dir):
        for name in os.listdir(dir):
            file = os.path.join(dir, name)
            if not os.path.islink(file) and not os.path.isdir(file):
                os.remove(file)

    def __split_by_prevalence(self):
        [csr, y, y_names] = self._read_pickle(self._train_file)
        _prevalence = self.method_prev(y)
        ly = y.shape[1]
        _VLP = []
        _LP = []
        _HP = []
        _VHP = []
        _col = 0
        for _val in _prevalence:
            if _val < 0.01:
                for _i in range(4):
                    _VLP.append(_col + ly * _i)
            elif _val >= 0.01 and _val < 0.05:
                for _i in range(4):
                    _LP.append(_col + ly * _i)
            elif _val >= 0.05 and _val < 0.1:
                for _i in range(4):
                    _HP.append(_col + ly * _i)
            elif _val >= 0.1:
                for _i in range(4):
                    _VHP.append(_col + ly * _i)
            _col += 1
        return [0, _VLP, _LP, _HP, _VHP]

    def __split_by_distribution_drift(self):  # pickle_QuantRCV1
        [csr, y, y_names] = self._read_pickle(self._train_file)
        pr_train = self.method_prev(y)
        _arrange = []
        j = 0
        for _test_file in self._test_files:
            [csr1, y1, y1_names] = self._read_pickle(_test_file)
            pr_test = self.method_prev(y1)
            # _arrange.append((_j,self.kld_bin(pr_test, pr_train)))
            for _i in range(len(pr_train)):
                _arrange.append((j, self._kld([pr_test[_i], 1 - pr_test[_i]], [pr_train[_i], 1 - pr_train[_i]])))
                j = j + 1
        _arrange_sorted = sorted(_arrange, key=operator.itemgetter(1))
        _VLD = [_x[0] for _x in _arrange_sorted[: len(y_names)]]
        _LD = [_x[0] for _x in _arrange_sorted[len(y_names) : 2 * len(y_names)]]
        _HD = [_x[0] for _x in _arrange_sorted[2 * len(y_names) : 3 * len(y_names)]]
        _VHD = [_x[0] for _x in _arrange_sorted[3 * len(y_names) :]]
        return [_arrange, _VLD, _LD, _HD, _VHD]

    def _read_pickle(self, file):
        print("Read file " + file)
        with open(file, "rb") as f:
            data = pickle.load(f)
            f.close()
        return data

    def _estimate_cl_indexes(self):  # pickle_QuantRCV1
        # [_csr, _y, y_names]=self._read_pickle(self._train_file)
        # _prev_train=self.count_prevalence(_y)
        # model=self.arff.fit(_csr,_y)
        _pr_list = []
        _y1_list = []
        for _test_file in self._test_files:
            [_csr1, _y1, _y1_names] = self._read_pickle(_test_file)
            _y1_list.append(_y1)
            _pr_list.append(self.model.predict(_csr1))
        with open(self.prefix + "cl_indexes_" + self.dir_name + ".pickle", "wb") as f:
            print(self.prefix + "cl_indexes_" + self.dir_name + ".pickle")
            pickle.dump([_y, _y1_list, _pr_list, _test_files, y_names], f)
            f.close()
        names_ = [_y, _y1_list, _pr_list, _test_files, y_names]
        return names_

    def __subset(self, _inp_set, _indexes):
        _sub_set = []
        for _i in _indexes:
            _sub_set.append(_inp_set[_i])
        # _sub_set=_sub_set/np.sum(_sub_set)
        return _sub_set

    def __count_splited_KLD(self, _part, _prev_test, _prev_test_estimate):
        split_by = [
            np.average(
                self._divergence_bin(self.__subset(_prev_test, _part[1]), self.__subset(_prev_test_estimate, _part[1]))
            ),
            np.average(
                self._divergence_bin(self.__subset(_prev_test, _part[2]), self.__subset(_prev_test_estimate, _part[2]))
            ),
            np.average(
                self._divergence_bin(self.__subset(_prev_test, _part[3]), self.__subset(_prev_test_estimate, _part[3]))
            ),
            np.average(
                self._divergence_bin(self.__subset(_prev_test, _part[4]), self.__subset(_prev_test_estimate, _part[4]))
            ),
            np.average(self._divergence_bin(_prev_test, _prev_test_estimate)),
        ]
        return split_by

    def __count_ttest(self, _prev_test, _prev_test_estimate1, _prev_test_estimate2):
        _kld_1 = self._divergence_bin(_prev_test, _prev_test_estimate1)
        _kld_2 = self._divergence_bin(_prev_test, _prev_test_estimate2)
        tt = stats.ttest_rel(_kld_1, _kld_2)
        return tt

    def _classify_and_count(self, _y_test):
        # _prev_test=[]
        # for _y_test in y_list:# Test files loop
        #    if is_prob:
        #        _prev_test=np.concatenate((_prev_test,self.method_prev(_y_test)), axis=1)
        #    else:
        #        _prev_test=np.concatenate((_prev_test,self.method_prev(_y_test)), axis=1)
        _prev_test = self.method_prev(_y_test)
        return _prev_test

    def _count_diff1(self, _prev_test, _prev_test_estimate, _num_iter):
        _parts_P = self.__split_by_prevalence()
        _parts_D = self.__split_by_distribution_drift()
        kld_bin = self._divergence_bin(_prev_test, _prev_test_estimate)
        print("\t\t\t VLP \t\t\t LP \t\t\t HP \t\t\t VHP \t\t\t total")
        print(
            np.average(self.__subset(kld_bin, _parts_P[1])),
            np.average(self.__subset(kld_bin, _parts_P[2])),
            np.average(self.__subset(kld_bin, _parts_P[3])),
            np.average(self.__subset(kld_bin, _parts_P[4])),
            np.average(kld_bin),
        )
        print("\t\t\t VLD \t\t\t LD \t\t\t HD \t\t\t VHD \t\t\t total")
        print(
            np.average(self.__subset(kld_bin, _parts_D[1])),
            np.average(self.__subset(kld_bin, _parts_D[2])),
            np.average(self.__subset(kld_bin, _parts_D[3])),
            np.average(self.__subset(kld_bin, _parts_D[4])),
            np.average(kld_bin),
        )
        print("\t\t\t VLP \t\t\t LP \t\t\t HP \t\t\t VHP \t\t\t total")
        print(
            np.average(self.__subset(_num_iter, _parts_P[1])),
            np.average(self.__subset(_num_iter, _parts_P[2])),
            np.average(self.__subset(_num_iter, _parts_P[3])),
            np.average(self.__subset(_num_iter, _parts_P[4])),
            np.average(_num_iter),
        )
        print("\t\t\t VLD \t\t\t LD \t\t\t HD \t\t\t VHD \t\t\t total")
        print(
            np.average(self.__subset(_num_iter, _parts_D[1])),
            np.average(self.__subset(_num_iter, _parts_D[2])),
            np.average(self.__subset(_num_iter, _parts_D[3])),
            np.average(self.__subset(_num_iter, _parts_D[4])),
            np.average(_num_iter),
        )
        return 0

    def _count_diff(self, _prev_test, _prev_test_estimate):
        _parts_D = self.__split_by_distribution_drift()
        _parts_P = self.__split_by_prevalence()
        # print(len(_parts_P[1]),len(_parts_P[2]),len(_parts_P[3]),len(_parts_P[4]))
        _kld_P = self.__count_splited_KLD(_parts_P, _prev_test, _prev_test_estimate)
        print("\t\t\t\t VLP \t\t\t\t LP \t\t\t\t HP \t\t\t\t VHP \t\t\t\t total \n", _kld_P)
        _kld_D = self.__count_splited_KLD(_parts_D, _prev_test, _prev_test_estimate)
        print("\t\t\t\t VLD \t\t\t\t LD \t\t\t\t HD \t\t\t\t VHD \t\t\t\t total \n", _kld_D)
        return _kld_P[4]

    def _unite_cl_prob(self):
        # read probabilities from separate files and aggregate it to one file
        [_csr, _y, y_names] = self._read_pickle(self._train_file)
        _train_file, _test_files = self.arff.read_dir(self.prefix + "cl_prob_" + self.dir_name)
        _prob_list = []
        for _test_file in _test_files:
            with open(_test_file, "rb") as f:
                _prob = pickle.load(f)
                f.close()
            _prob_list.append(_prob)
        _y1_list = []
        for _test_file1 in self._test_files:
            [_csr1, _y1, _y1_names] = self._read_pickle(_test_file1)
            _y1_list.append(_y1)
        with open("texts/cl_prob_" + self.dir_name + ".pickle", "wb") as f:
            pickle.dump([_y, _y1_list, _prob_list, self._test_files, _y1_names], f)
            f.close()
        return [_y, _y1_list, _prob_list, self._test_files, _y1_names]

    def _estimate_cl_prob(self):
        try:
            with open("texts/ml_model_" + self.dir_name + ".pickle", "rb") as f:
                self.model = pickle.load(f)
                f.close()
        except:
            [_csr, _y, y_names] = self._read_pickle(self._train_file)
            _prev_train = self.count_prevalence(_y)
            model = self.model  # self.arff.fit(_csr,_y)
            with open("texts/ml_model_" + self.dir_name + ".pickle", "wb") as f:
                pickle.dump(model, f)
                f.close()

        _prob_list = []
        _y1_list = []
        for _t in range(len(self._test_files)):  # range(42,52):
            _test_file = self._test_files[_t]
            [_csr1, _y1, _y1_names] = self._read_pickle(_test_file)
            _y1_list.append(_y1)
            _prob = model.predict_proba(_csr1)
            _prob_list.append(_prob)
            with open(
                "texts/cl_prob_" + _test_file.rstrip(".arff.pickle").lstrip("texts/pickle_") + ".cl_prob", "wb"
            ) as f:
                pickle.dump(_prob, f)
                f.close()
        with open("texts/cl_prob_" + self.dir_name + ".pickle", "wb") as f:
            pickle.dump([_y, _y1_list, _prob_list, self._test_files, _y1_names], f)
            f.close()
        return [_y, _y1_list, _prob_list, self._test_files, y_names]

    def _prob_classify_and_count(self, pred_prob):
        # avr_prob=[]
        # for pred_prob in pred_prob_list:
        #    avr_prob=np.concatenate((avr_prob,np.average(pred_prob, axis=0)))
        # print('PCC',avr_prob)
        return np.average(pred_prob, axis=0)

    def _exp_max(self, y_train, pred_prob, stop_delta=0.1):
        """Joint EM re-estimation of the class prevalences.

        Starting from the training prevalence, the predicted probabilities
        are repeatedly re-weighted by ``current / training`` prevalence,
        L1-normalised per sample (E step) and averaged per class (M step).
        Iteration stops when the mean absolute change drops to
        ``stop_delta`` or below, when the change grows compared with the
        previous step, or after 100 rounds.  If the largest resulting
        prevalence exceeds 0.99 the plain per-class mean of the predicted
        probabilities is returned instead.
        """
        train_prev = self._bin_prevalence(y_train)
        current_prev = train_prev.copy()
        class_probs = pred_prob.T  # one row per class
        adjusted = class_probs.copy()
        change = 1
        previous_change = 1
        rounds = 0
        while change > stop_delta and change <= previous_change and rounds < 100:
            # E step: re-weight each class row, then normalise every sample.
            for cls in range(len(train_prev)):
                adjusted[cls] = class_probs[cls].copy() * (current_prev[cls] / train_prev[cls])
            adjusted = normalize(adjusted, norm="l1", axis=0)
            # M step: the new prevalence is the mean posterior per class.
            updated_prev = np.average(adjusted, axis=1)
            previous_change = change
            change = self._ae(current_prev, updated_prev)
            current_prev = updated_prev.copy()
            rounds = rounds + 1
        if np.max(current_prev) > 0.99:
            # Degenerate estimate: fall back to the un-adjusted class means.
            current_prev = np.average(class_probs, axis=1)
        return current_prev

    def _expectation_maximization(self, y_train, pred_prob, stop_delta=0.1):  # _indexes
        # [y_train, y_test_list, pred_prob_list, test_files, y_names]=_indexes
        # print(pred_prob_list[0][1])
        pr_train = self._bin_prevalence(y_train)
        pr_all = []
        num_iter = []
        test_num = 0  # 0..3 len(_y_test_list)
        pr_c = pr_train.copy()

        prob = pred_prob.T
        for cl_n in range(len(pr_train)):  # Category
            # print('Test set N %s, class number %s' %(test_num, cl_n))
            iter = 0
            _delta = 1
            while _delta > stop_delta:
                pr_c_x = []
                _j = 0
                for pr_c_xk in prob[cl_n]:  # xk in category c
                    # Step E
                    pr_c_x_k = (pr_c[cl_n] / pr_train[cl_n] * pr_c_xk) / (
                        ((1 - pr_c[cl_n]) / (1 - pr_train[cl_n])) * (1 - pr_c_xk)
                        + pr_c[cl_n] / pr_train[cl_n] * pr_c_xk
                    )
                    pr_c_x.append(pr_c_x_k)
                    _j += 1
                # Step M
                pr_c_new = np.average(pr_c_x)  # np.average(_prob[cl_n])
                _delta = np.abs(pr_c_new - pr_c[cl_n])
                # print('_delta',_delta)
                # pr_train[cl_n]=pr_c[cl_n]
                # prob[cl_n]=pr_c_x_k
                pr_c[cl_n] = pr_c_new
                iter += 1
            num_iter.append(iter)
            if np.max([pr_c[cl_n], 1 - pr_c[cl_n]]) > 0.99:
                pr_c[cl_n] = np.average(prob[cl_n])
        return pr_c  # ,num_iter

    def _cost_sens_learning(self, X_test, stop_delta=0.00001, class_weight_start="auto"):
        pred_prev_train = self._classify_and_count(self.y_train)
        pred_prev0 = pred_prev_train.copy()
        model = self.__classificator(class_weight=class_weight_start)  # class_weight={0:1,1:1})##
        model.fit(self.X_train, self.y_train)
        pred_prev1 = np.average(model.predict_proba(X_test), axis=0)  #
        # pred_prev1=self._classify_and_count(model.predict(X_test))
        delta1 = 0
        delta2 = 0
        d_delta1 = 0
        d_delta2 = 0
        for i in range(10):
            # print('pred_prev0',pred_prev0)
            # print('pred_prev1',pred_prev1)
            # print(pred_prev1/pred_prev_train)
            # print(delta2)
            class_weight = dict(zip(self.classes, pred_prev1 / pred_prev_train))

            model = self.__classificator(class_weight=class_weight)
            model.fit(self.X_train, self.y_train)
            pred_prev2 = np.average(model.predict_proba(X_test), axis=0)  #
            # pred_prev2=self._classify_and_count(model.predict(X_test))#
            delta1 = delta2
            delta2 = self._ae(pred_prev1, pred_prev2)
            d_delta3 = abs(delta2 - delta1)
            if delta2 < stop_delta or d_delta3 > d_delta2 and d_delta2 > d_delta1 and d_delta1 != 0:
                # print('dd',d_delta1, d_delta2,d_delta3)
                self.iter_model = model
                break
            d_delta1 = d_delta2
            d_delta2 = d_delta3
            # print(pred_prev2[0],'\t', delta1)
            # if delta2<stop_delta:
            #    break
            pred_prev0 = pred_prev1.copy()
            pred_prev1 = pred_prev2.copy()
        # print('pred_prev1',pred_prev1)
        self.iter_model = model
        return pred_prev1

    def __conditional_probability(self, p1, p2, val1, val2):
        c = 0
        for _i in range(len(p1)):
            if p1[_i] == val1 and p2[_i] == val2:
                c = c + 1
        return c / len(p1)

    def __kfold_tp_fp(self, X, y, n_folds=2):
        """Estimate per-class hit frequencies by k-fold cross-validation.

        For every fold a copy of ``self.model`` is trained on the training
        part and evaluated on the held-out part.  For each class column the
        fraction of held-out samples that are (predicted 1, true 1) and
        (predicted 1, true 0) is recorded and averaged over the folds.

        The averaged values are cached in
        ``<prefix><dir_name>/<n_folds>FCV.pickle``; when that file can be read
        its content is returned and no training happens.

        Parameters:
            X: training samples, indexable with arrays of row indices.
            y: labels (list or array).  1-D labels are binarized against
                ``self.classes``; 2-D labels are used as given.
            n_folds: number of cross-validation folds.

        Returns:
            ``[tp_av, fp_av]`` with one entry per class.
        """
        import copy  # function-scope import: only needed to protect self.model

        if isinstance(y, list):
            y = np.asarray(y)

        cache_path = self.prefix + self.dir_name + "/" + str(n_folds) + "FCV.pickle"
        try:
            # NOTE: pickle must only ever be pointed at trusted cache files.
            with open(cache_path, "rb") as f:
                [tp_av, fp_av] = pickle.load(f)
            return [tp_av, fp_av]
        except Exception:
            # A missing or unreadable cache is expected on the first run;
            # fall through and recompute.
            pass

        _kf = KFold(y.shape[0], n_folds=n_folds)
        tp = []
        fp = []
        for train_index, test_index in _kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # Train a copy: fitting self.model here would silently replace the
            # model trained on the full data with one trained on a single fold.
            model = copy.deepcopy(self.model)
            model = model.fit(X_train, y_train)
            y_predict = model.predict(X_test)
            if len(y.shape) == 1:
                y_test = MultiLabelBinarizer(classes=self.classes).fit_transform([[y_p] for y_p in y_test])
                y_predict = MultiLabelBinarizer(classes=self.classes).fit_transform([[y_p] for y_p in y_predict])
            tp_k = []
            fp_k = []
            for s_true, s_pred in zip(y_test.T, y_predict.T):
                tp_k.append(self.__conditional_probability(s_pred, s_true, 1.0, 1.0))
                fp_k.append(self.__conditional_probability(s_pred, s_true, 1.0, 0.0))
            tp.append(tp_k)
            fp.append(fp_k)
        tp_av = np.asarray([np.average(tp_k) for tp_k in np.asarray(tp).T])
        fp_av = np.asarray([np.average(fp_k) for fp_k in np.asarray(fp).T])
        with open(cache_path, "wb") as f:
            pickle.dump([tp_av, fp_av], f)
        return [tp_av, fp_av]

    def __kfold_prob_tp_fp(self, X, y, n_folds=2):
        """Estimate per-class probability mass by k-fold cross-validation.

        For every fold a copy of ``self.model`` is trained and applied to the
        held-out part.  For each class column the predicted probabilities of
        samples *predicted* as 1 (``TP``) and as 0 (``FP``) are summed and
        divided by the number of held-out samples; the results are averaged
        over the folds.

        The averaged values are cached in
        ``<prefix><dir_name>/<n_folds>FCV_prob.pickle``; when that file can be
        read its content is returned and no training happens.

        Parameters:
            X: training samples, indexable with arrays of row indices.
            y: labels (list or array); 1-D predictions are binarized against
                ``self.classes``.
            n_folds: number of cross-validation folds.

        Returns:
            ``[tp_av, fp_av]`` with one entry per class.
        """
        import copy  # function-scope import: only needed to protect self.model

        if isinstance(y, list):
            y = np.asarray(y)

        cache_path = self.prefix + self.dir_name + "/" + str(n_folds) + "FCV_prob.pickle"
        try:
            # NOTE: pickle must only ever be pointed at trusted cache files.
            with open(cache_path, "rb") as f:
                [tp_av, fp_av] = pickle.load(f)
            return [tp_av, fp_av]
        except Exception:
            # A missing or unreadable cache is expected on the first run;
            # fall through and recompute.
            pass

        kf = KFold(y.shape[0], n_folds=n_folds)
        TP_avr = []
        FP_avr = []
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train = y[train_index]
            # Train a copy: fitting self.model here would leave the caller
            # predicting with a model trained on the last fold only.
            model = copy.deepcopy(self.model)
            model = model.fit(X_train, y_train)
            y_predict = model.predict(X_test)
            y_prob_predict = model.predict_proba(X_test)
            if len(y.shape) == 1:
                y_predict = MultiLabelBinarizer(classes=self.classes).fit_transform([[y_p] for y_p in y_predict])
            TP = []
            FP = []
            # NOTE(review): the split below uses the predicted labels, not the
            # held-out true labels -- confirm this is the intended estimate.
            for class_ind, class_prob in zip(y_predict.transpose(), y_prob_predict.transpose()):
                TP_class = [prob for ind, prob in zip(class_ind, class_prob) if ind == 1]
                FP_class = [prob for ind, prob in zip(class_ind, class_prob) if ind == 0]
                TP.append(np.sum(TP_class) / len(class_ind))
                FP.append(np.sum(FP_class) / len(class_ind))
            TP_avr.append(TP)
            FP_avr.append(FP)
        tp_av, fp_av = np.average(TP_avr, axis=0), np.average(FP_avr, axis=0)
        with open(cache_path, "wb") as f:
            pickle.dump([tp_av, fp_av], f)
        return [tp_av, fp_av]

    def _adj_classify_and_count(self, y_pred, is_prob=False):
        [tp_av, fp_av] = self.kfold_results

        if is_prob:
            pr = np.average(y_pred, axis=0)
        else:
            pr = self.method_prev(y_pred)
        try:
            pred = (pr - fp_av) / (tp_av - fp_av)
            if np.min(pred) >= 0:
                pred = normalize(pred, norm="l1", axis=1)[0]
            else:
                # print(pred)
                # print(pr,tp_av,fp_av)
                pred = pr
        except:
            print(pr, tp_av, fp_av)
            pred = pr
        return pred

    def _process_pipeline(self):
        """Run the batch evaluation over the pickled train/test files.

        Reads the training pickle, fits the model, loads the pre-computed
        ``texts/cl_indexes_<dir_name>.pickle`` and
        ``texts/cl_prob_<dir_name>.pickle`` files, computes several
        prevalence estimates and prints the comparison tables through
        ``_count_diff`` and ``_count_diff1``.  Nothing is returned.
        """
        # Warning: processing can take a long time; running it step by step
        # is recommended. Original usage sketch:
        # pa=Parse_ARFF()
        # pa.convert_arff(QuantOHSUMED, is_predict=False)
        # q=Quantification('QuantOHSUMED')
        # q.process_pipeline()
        #####################################################
        [self.X_train, self.y_train, y_names] = self._read_pickle(self._train_file)
        self.fit(self.X_train, self.y_train)
        # The cl_indexes pickle is expected to exist already; the call that
        # would create it is left disabled:
        # [y_train, y_test_list, y_pred_list, test_files, y_names]=self._estimate_cl_indexes()
        [y_train, y_test_list, y_pred_list, test_files, y_names] = self._read_pickle(
            "texts/cl_indexes_" + self.dir_name + ".pickle"
        )
        td = self._classify_and_count(y_test_list)
        ed1 = self._classify_and_count(y_pred_list)

        # NOTE(review): _adj_classify_and_count in this class is declared as
        # (y_pred, is_prob=False) and reads self.kfold_results; this call
        # passes three positional arguments and never sets kfold_results, so
        # it looks like it raises TypeError -- confirm and fix the call.
        ed2 = self._adj_classify_and_count(self.X_train, self.y_train, y_pred_list)

        self._estimate_cl_prob()
        self._unite_cl_prob()
        [y_train, y_test_list, pred_prob_list, test_files, y_names] = self._read_pickle(
            "texts/cl_prob_" + self.dir_name + ".pickle"
        )
        # NOTE(review): verify that _classify_and_count accepts ``is_prob``;
        # the definition is not visible next to this method.
        ed3 = self._classify_and_count(pred_prob_list, is_prob=True)
        ed4 = self._prob_classify_and_count(pred_prob_list)
        # NOTE(review): this unpacks two values -- verify that
        # _expectation_maximization returns (prevalence, num_iter).
        ed5, num_iter = self._expectation_maximization(self.y_train, pred_prob_list, 0.1)
        self._count_diff(td, ed4)
        self._count_diff1(td, ed5, num_iter)
Beispiel #4
0
class Quantification:
    def __classificator(self, class_weight='auto'):
        """Create the base classifier used for quantification.

        Returns a ``linear_model.LogisticRegression``.  ``class_weight`` is
        forwarded to the constructor unless it is the empty string, in which
        case the estimator is built with its default arguments.
        """
        # Alternatives tried previously: SVC(kernel='rbf', probability=True)
        # and GMM(n_components=2).
        options = {}
        if class_weight != '':
            options['class_weight'] = class_weight
        return linear_model.LogisticRegression(**options)

    def __init__(self, method='', dir_name='temp', is_clean=True):
        """Configure the quantifier.

        Parameters:
            method: one of 'CC', 'ACC', 'PCC', 'PACC', 'EM', 'EM1', 'Iter',
                'Iter1' or 'test'; the empty string selects 'CC'.  Any other
                value leaves ``self.method`` unset.
            dir_name: working sub-directory below ``texts/``.
            is_clean: when true, plain files in ``texts/<dir_name>`` are
                removed on construction.
        """
        self.prefix = 'texts/'
        self.arff = Parse_ARFF()
        self.dir_name = dir_name
        if is_clean:
            self.__clean_dir(self.prefix + self.dir_name)
        self.n_folds = 5
        self.classes = [0, 1]
        # Either ._bin_prevalence or ._multi_prevalence.
        self.method_prev = self._bin_prevalence
        self.model = self.__classificator(class_weight='auto')

        known_methods = ('EM', 'EM1', 'Iter', 'Iter1',
                         'PCC', 'CC', 'ACC', 'PACC', 'test')
        if method in known_methods:
            self.method = method
            if method == 'test':
                # The batch pipeline works on pre-pickled train/test files.
                self._train_file, self._test_files = self.arff.read_dir(
                    self.prefix + 'pickle_' + dir_name)
        elif method == '':
            self.method = 'CC'

    def fit(self, X, y):
        if isinstance(y, list): y = np.asarray(y)
        self.classes = np.unique(y)
        #if isinstance(y, csr_matrix):
        #    self.y_train=y.toarray()
        #elif isinstance(y, np.ndarray):
        #    if len(y.shape)==1:
        #        self.y_train=MultiLabelBinarizer(classes=self.classes).fit_transform([[y_p] for y_p in y])
        #    elif len(y.shape)==2:
        #        self.y_train=y
        self.y_train = y
        self.X_train = X
        self.model.fit(X, y)
        return self.model

    def predict(self, X, method=''):
        if method != '':
            self.method = method
        if self.method == 'CC':
            y_pred = self.model.predict(X)
            #print('CC', y_pred)
            prevalence = self._classify_and_count(y_pred)
        elif self.method == 'ACC':
            y_pred = self.model.predict(X)
            self.kfold_results = self.__kfold_tp_fp(self.X_train,
                                                    self.y_train,
                                                    n_folds=self.n_folds)
            prevalence = self._adj_classify_and_count(y_pred, is_prob=False)
        elif self.method == 'PCC':
            prob_pred = self.model.predict_proba(X)
            prevalence = self._prob_classify_and_count(prob_pred)
        elif self.method == 'PACC':
            self.kfold_results = self.__kfold_prob_tp_fp(self.X_train,
                                                         self.y_train,
                                                         n_folds=self.n_folds)
            prob_pred = self.model.predict_proba(X)
            prevalence = self._adj_classify_and_count(prob_pred, is_prob=True)
        elif self.method == 'EM':
            prob_pred = self.model.predict_proba(X)
            prevalence = self._expectation_maximization(self.y_train,
                                                        prob_pred,
                                                        stop_delta=0.00001)
        elif self.method == 'EM1':
            prob_pred = self.model.predict_proba(X)
            prevalence = self._exp_max(self.y_train,
                                       prob_pred,
                                       stop_delta=0.00001)
        elif self.method == 'Iter':
            prevalence = self._cost_sens_learning(X,
                                                  stop_delta=0.00001,
                                                  class_weight_start='auto')
        elif self.method == 'Iter1':
            prevalence = self._cost_sens_learning(X,
                                                  stop_delta=0.00001,
                                                  class_weight_start='')
        elif self.method == 'test':
            self._process_pipeline()
        return prevalence

    def predict_set(self, X_list, method=''):
        scores = []
        for X in X_list:
            prev_pred = self.predict(X, method)
            scores.append(prev_pred)
        return scores

    def score(self, X_list, y_list, method=''):
        scores = []
        for X, y in zip(X_list, y_list):
            y = np.asarray(y)
            prev_pred = self.predict(X, method)
            prev_true = self._classify_and_count(y)
            #print(prev_pred, prev_true)
            #scores.append(self._divergence_bin(prev_true, prev_pred, self._kld))
            scores.append(self._emd(prev_true, prev_pred))
        return np.average(scores)

    def make_drift_rnd(X, y, proportion=0.5):
        index = {}
        for val in scipy.unique(y):
            index[val] = []
        for key in range(len(y)):
            index[y[key]].append(key)
        ind2low = []
        num2low = int(len(index) / 2)
        while ind2low == [] and num2low != 0:
            j = 0
            for i in index:
                #print(i, j, num2low)
                if j >= num2low:
                    break
                rnd = random.random()
                #print(rnd,j,i)
                if rnd < 0.5:
                    ind2low.append(i)
                    j += 1
        new_ind = index.copy()
        new_set = []
        for ind in ind2low:
            for val in index[ind]:
                rnd = random.random()
                if rnd > proportion:
                    new_set.append(val)
            new_ind[ind] = new_set
        new_y = []
        new_X = []

        for i in index:
            try:
                new_y = np.concatenate((new_y, y[new_ind[i]]))
                new_X = np.concatenate((new_X, X[new_ind[i]]), axis=0)
            except:
                new_y = y[new_ind[i]]
                new_X = X[new_ind[i]]
        return new_X, new_y

    def make_drift_05(X, y, proportion=0.5):
        #index=[]
        #for key in range(len(scipy.unique(y))):
        #    index.append(key)
        #y=MultiLabelBinarizer(classes=index).fit_transform([[y_p] for y_p in y])
        ind2low = []
        if proportion < 0.5:
            ind2low.append(0)
            proportion = proportion * 2
        else:
            ind2low = [i for i in range(1, y.shape[1])]
            proportion = (1 - proportion) * 2

        new_X = np.array([], ndmin=2)
        new_y = np.array([], ndmin=2)
        for clas in ind2low:
            for ind, num in zip(y.transpose()[clas],
                                range(len(y.transpose()[clas]))):
                if ind > 0.5:
                    rnd = random.random()
                    if rnd < proportion:
                        if new_X != np.array([], ndmin=2):
                            #print(ind, rnd, new_y.shape,new_X.shape[0])
                            tX = np.ndarray(shape=(1, X[num].shape[0]),
                                            buffer=X[num].copy())
                            new_X = np.concatenate((new_X, tX), axis=0)
                            ty = np.ndarray(shape=(1, y[num].shape[0]),
                                            buffer=y[num].copy(),
                                            dtype=int)
                            new_y = np.concatenate((new_y, ty), axis=0)
                        else:
                            new_X = np.ndarray(shape=(1, X[num].shape[0]),
                                               buffer=X[num].copy())
                            new_y = np.ndarray(shape=(1, y[num].shape[0]),
                                               buffer=y[num].copy(),
                                               dtype=int)
                else:
                    if new_X != np.array([], ndmin=2):
                        tX = np.ndarray(shape=(1, X[num].shape[0]),
                                        buffer=X[num].copy())
                        new_X = np.concatenate((new_X, tX), axis=0)
                        ty = np.ndarray(shape=(1, y[num].shape[0]),
                                        buffer=y[num].copy(),
                                        dtype=int)
                        new_y = np.concatenate((new_y, ty), axis=0)
                    else:
                        new_X = np.ndarray(shape=(1, X[num].shape[0]),
                                           buffer=X[num].copy())
                        new_y = np.ndarray(shape=(1, y[num].shape[0]),
                                           buffer=y[num].copy(),
                                           dtype=int)
        return new_X, new_y

    def make_drift_list(X, y, proportion=0.5):
        #index=[]
        #for key in range(len(scipy.unique(y))):
        #    index.append(key)
        #y=MultiLabelBinarizer(classes=index).fit_transform([[y_p] for y_p in y])
        ind_set = scipy.unique(y)
        if proportion < 0.5:
            ind2low = set([0])
            proportion = proportion * 2
        else:
            ind2low = set([i for i in range(1, len(ind_set))])
            proportion = (1 - proportion) * 2

        new_X = np.array([], ndmin=2)
        new_y = []
        for clas in ind_set:
            for ind, num in zip(y, range(len(y))):
                if ind in ind2low:
                    rnd = random.random()
                    if rnd < proportion:
                        if new_X != np.array([], ndmin=2):
                            tX = np.ndarray(shape=(1, X[num].shape[0]),
                                            buffer=X[num].copy())
                            new_X = np.concatenate((new_X, tX), axis=0)
                            new_y.append(ind)
                        else:
                            new_X = np.ndarray(shape=(1, X[num].shape[0]),
                                               buffer=X[num].copy())
                            new_y.append(ind)
                else:
                    if new_X != np.array([], ndmin=2):
                        tX = np.ndarray(shape=(1, X[num].shape[0]),
                                        buffer=X[num].copy())
                        new_X = np.concatenate((new_X, tX), axis=0)
                        new_y.append(ind)
                    else:
                        new_X = np.ndarray(shape=(1, X[num].shape[0]),
                                           buffer=X[num].copy())
                        new_y.append(ind)
        return new_X, new_y

    def _kld(self, p, q):
        """Kullback-Leibler divergence D(P || Q) for discrete distributions when Q is used to approximate P
        Parameters
        p, q : array-like, dtype=float, shape=n
        Discrete probability distributions."""
        p = np.asarray(p, dtype=np.float)
        q = np.asarray(q, dtype=np.float)
        return np.sum(np.where(p != 0, p * np.log(p / q), 0))

    def _rae(self, p, q):
        p = np.asarray(p, dtype=np.float)
        q = np.asarray(q, dtype=np.float)
        return np.sum(np.where(p != 0, np.abs(q - p) / p, 0))

    def _ae(self, p, q):
        #Absolute error
        p = np.asarray(p, dtype=np.float)
        q = np.asarray(q, dtype=np.float)
        return np.average(np.abs(q - p))

    def _emd(self, p, q):
        #Earth Mover’s Distance (Rubner et al., 2000)
        p = np.asarray(p, dtype=np.float)
        q = np.asarray(q, dtype=np.float)
        emd = 0
        for i in range(1, len(p)):
            #emd+=np.abs(np.sum(q[0:i])-np.sum(p[0:i]))
            emd += np.sum(np.abs(q[0:i] - p[0:i]))
        return emd

    def _divergence_bin(self, p, q, func=''):
        if func == '': func = self._kld
        p = np.asarray(p, dtype=np.float)
        q = np.asarray(q, dtype=np.float)
        #print(p,q)
        klds = []
        for p_i, q_i in zip(p, q):
            klds.append(func([p_i, 1 - p_i], [q_i, 1 - q_i]))
        #print(len(_klds))
        #_avg=np.average(_klds)
        return klds  #_avg

    def _multi_prevalence(self, y):
        prevalence = []
        prevalence_smooth = []
        eps = 1 / (2 * y.shape[0])
        if isinstance(y, csr_matrix):
            for _col in range(y.shape[1]):
                prevalence.append(y.getcol(_col).nnz)
            prevalence = prevalence / (np.sum(prevalence))
            for _val in prevalence:  # perform smoothing
                prevalence_smooth.append(_val + eps)
            prevalence_smooth = prevalence_smooth / (np.sum(prevalence) +
                                                     eps * y.shape[1])
        elif isinstance(y, np.ndarray):
            if len(y.shape) == 1:
                yt = MultiLabelBinarizer(classes=self.classes).fit_transform(
                    [[y_p] for y_p in y]).transpose()
                for col in yt:
                    prevalence.append(np.sum(col))
                prevalence = prevalence / (np.sum(prevalence))
                for val in prevalence:  # perform smoothing
                    prevalence_smooth.append(val + eps)
                prevalence_smooth = prevalence_smooth / (np.sum(prevalence) +
                                                         eps * yt.shape[0])
            elif len(y.shape) == 2:
                yt = y.transpose()
                for col in yt:
                    prevalence.append(np.sum(col))
                prevalence = prevalence / (np.sum(prevalence))
                for val in prevalence:  # perform smoothing
                    prevalence_smooth.append(val + eps)
                prevalence_smooth = prevalence_smooth / (np.sum(prevalence) +
                                                         eps * yt.shape[0])
        return prevalence_smooth

    def _bin_prevalence(self, y):
        prevalence = []
        if isinstance(y, csr_matrix):
            eps = 1 / (2 * y.shape[0])
            for col in range(y.shape[1]):
                prevalence.append((y.getcol(col).nnz + eps) /
                                  (eps * y.shape[1] + y.shape[0]))
            prevalence = np.asarray(prevalence, dtype=np.float)
        elif isinstance(y, list):
            eps = 1 / (2 * len(y))
            yt = MultiLabelBinarizer(classes=self.classes).fit_transform(
                [[y_p] for y_p in y]).transpose()
            for col in range(yt.shape[0]):
                prevalence.append((np.sum(yt[col]) + eps) /
                                  (eps * yt.shape[0] + yt.shape[1]))
            prevalence = np.asarray(prevalence, dtype=np.float)
        elif isinstance(y, np.ndarray):
            eps = 1 / (2 * y.shape[0])
            if len(y.shape) == 1:
                #print(self.classes, 'Variable "y" should have more then 1 dimension. Use MultiLabelBinarizer()')
                yt = MultiLabelBinarizer(classes=self.classes).fit_transform(
                    [[y_p] for y_p in y]).transpose()
            elif len(y.shape) == 2:
                yt = y.transpose()
            for col in range(yt.shape[0]):
                prevalence.append((np.sum(yt[col]) + eps) /
                                  (eps * yt.shape[0] + yt.shape[1]))
            prevalence = np.asarray(prevalence, dtype=np.float)
        return prevalence

    def _bin_prevalence_prob(self, y):
        y = np.asarray(y, dtype=np.float).T
        eps = 1 / (2 * y.shape[1])
        prevalence = []
        print(self.model.intercept_[0][0], self.model.coef_)
        for col in y:
            nnz = 0
            for elem in col:
                if elem >= self.model.intercept_:
                    nnz += 1
            prevalence.append((nnz + eps) / (eps * y.shape[0] + y.shape[1]))
        return prevalence

    def __clean_dir(self, dir):
        for name in os.listdir(dir):
            file = os.path.join(dir, name)
            if not os.path.islink(file) and not os.path.isdir(file):
                os.remove(file)

    def __split_by_prevalence(self):
        [csr, y, y_names] = self._read_pickle(self._train_file)
        _prevalence = self.method_prev(y)
        ly = y.shape[1]
        _VLP = []
        _LP = []
        _HP = []
        _VHP = []
        _col = 0
        for _val in _prevalence:
            if _val < 0.01:
                for _i in range(4):
                    _VLP.append(_col + ly * _i)
            elif _val >= 0.01 and _val < 0.05:
                for _i in range(4):
                    _LP.append(_col + ly * _i)
            elif _val >= 0.05 and _val < 0.1:
                for _i in range(4):
                    _HP.append(_col + ly * _i)
            elif _val >= 0.1:
                for _i in range(4):
                    _VHP.append(_col + ly * _i)
            _col += 1
        return [0, _VLP, _LP, _HP, _VHP]

    def __split_by_distribution_drift(self):  #pickle_QuantRCV1
        [csr, y, y_names] = self._read_pickle(self._train_file)
        pr_train = self.method_prev(y)
        _arrange = []
        j = 0
        for _test_file in self._test_files:
            [csr1, y1, y1_names] = self._read_pickle(_test_file)
            pr_test = self.method_prev(y1)
            #_arrange.append((_j,self.kld_bin(pr_test, pr_train)))
            for _i in range(len(pr_train)):
                _arrange.append((j,
                                 self._kld([pr_test[_i], 1 - pr_test[_i]],
                                           [pr_train[_i], 1 - pr_train[_i]])))
                j = j + 1
        _arrange_sorted = sorted(_arrange, key=operator.itemgetter(1))
        _VLD = [_x[0] for _x in _arrange_sorted[:len(y_names)]]
        _LD = [_x[0] for _x in _arrange_sorted[len(y_names):2 * len(y_names)]]
        _HD = [
            _x[0] for _x in _arrange_sorted[2 * len(y_names):3 * len(y_names)]
        ]
        _VHD = [_x[0] for _x in _arrange_sorted[3 * len(y_names):]]
        return [_arrange, _VLD, _LD, _HD, _VHD]

    def _read_pickle(self, file):
        print('Read file ' + file)
        with open(file, 'rb') as f:
            data = pickle.load(f)
            f.close()
        return data

    def _estimate_cl_indexes(self):  #pickle_QuantRCV1
        #[_csr, _y, y_names]=self._read_pickle(self._train_file)
        #_prev_train=self.count_prevalence(_y)
        #model=self.arff.fit(_csr,_y)
        _pr_list = []
        _y1_list = []
        for _test_file in self._test_files:
            [_csr1, _y1, _y1_names] = self._read_pickle(_test_file)
            _y1_list.append(_y1)
            _pr_list.append(self.model.predict(_csr1))
        with open(self.prefix + 'cl_indexes_' + self.dir_name + '.pickle',
                  'wb') as f:
            print(self.prefix + 'cl_indexes_' + self.dir_name + '.pickle')
            pickle.dump([_y, _y1_list, _pr_list, _test_files, y_names], f)
            f.close()
        names_ = [_y, _y1_list, _pr_list, _test_files, y_names]
        return names_

    def __subset(self, _inp_set, _indexes):
        _sub_set = []
        for _i in _indexes:
            _sub_set.append(_inp_set[_i])
        #_sub_set=_sub_set/np.sum(_sub_set)
        return _sub_set

    def __count_splited_KLD(self, _part, _prev_test, _prev_test_estimate):
        split_by = [
            np.average(
                self._divergence_bin(
                    self.__subset(_prev_test, _part[1]),
                    self.__subset(_prev_test_estimate, _part[1]))),
            np.average(
                self._divergence_bin(
                    self.__subset(_prev_test, _part[2]),
                    self.__subset(_prev_test_estimate, _part[2]))),
            np.average(
                self._divergence_bin(
                    self.__subset(_prev_test, _part[3]),
                    self.__subset(_prev_test_estimate, _part[3]))),
            np.average(
                self._divergence_bin(
                    self.__subset(_prev_test, _part[4]),
                    self.__subset(_prev_test_estimate, _part[4]))),
            np.average(self._divergence_bin(_prev_test, _prev_test_estimate))
        ]
        return split_by

    def __count_ttest(self, _prev_test, _prev_test_estimate1,
                      _prev_test_estimate2):
        _kld_1 = self._divergence_bin(_prev_test, _prev_test_estimate1)
        _kld_2 = self._divergence_bin(_prev_test, _prev_test_estimate2)
        tt = stats.ttest_rel(_kld_1, _kld_2)
        return tt

    def _classify_and_count(self, _y_test):
        #_prev_test=[]
        #for _y_test in y_list:# Test files loop
        #    if is_prob:
        #        _prev_test=np.concatenate((_prev_test,self.method_prev(_y_test)), axis=1)
        #    else:
        #        _prev_test=np.concatenate((_prev_test,self.method_prev(_y_test)), axis=1)
        _prev_test = self.method_prev(_y_test)
        return _prev_test

    def _count_diff1(self, _prev_test, _prev_test_estimate, _num_iter):
        _parts_P = self.__split_by_prevalence()
        _parts_D = self.__split_by_distribution_drift()
        kld_bin = self._divergence_bin(_prev_test, _prev_test_estimate)
        print('\t\t\t VLP \t\t\t LP \t\t\t HP \t\t\t VHP \t\t\t total')
        print(np.average(self.__subset(kld_bin, _parts_P[1])), np.average(self.__subset(kld_bin,_parts_P[2])),\
        np.average(self.__subset(kld_bin,_parts_P[3])), np.average(self.__subset(kld_bin,_parts_P[4])), np.average(kld_bin))
        print('\t\t\t VLD \t\t\t LD \t\t\t HD \t\t\t VHD \t\t\t total')
        print(np.average(self.__subset(kld_bin, _parts_D[1])), np.average(self.__subset(kld_bin,_parts_D[2])),\
        np.average(self.__subset(kld_bin,_parts_D[3])), np.average(self.__subset(kld_bin,_parts_D[4])), np.average(kld_bin))
        print('\t\t\t VLP \t\t\t LP \t\t\t HP \t\t\t VHP \t\t\t total')
        print(np.average(self.__subset(_num_iter, _parts_P[1])), np.average(self.__subset(_num_iter,_parts_P[2])),\
        np.average(self.__subset(_num_iter,_parts_P[3])), np.average(self.__subset(_num_iter,_parts_P[4])), np.average(_num_iter))
        print('\t\t\t VLD \t\t\t LD \t\t\t HD \t\t\t VHD \t\t\t total')
        print(np.average(self.__subset(_num_iter, _parts_D[1])), np.average(self.__subset(_num_iter,_parts_D[2])),\
        np.average(self.__subset(_num_iter,_parts_D[3])), np.average(self.__subset(_num_iter,_parts_D[4])), np.average(_num_iter))
        return 0

    def _count_diff(self, _prev_test, _prev_test_estimate):
        _parts_D = self.__split_by_distribution_drift()
        _parts_P = self.__split_by_prevalence()
        #print(len(_parts_P[1]),len(_parts_P[2]),len(_parts_P[3]),len(_parts_P[4]))
        _kld_P = self.__count_splited_KLD(_parts_P, _prev_test,
                                          _prev_test_estimate)
        print(
            '\t\t\t\t VLP \t\t\t\t LP \t\t\t\t HP \t\t\t\t VHP \t\t\t\t total \n',
            _kld_P)
        _kld_D = self.__count_splited_KLD(_parts_D, _prev_test,
                                          _prev_test_estimate)
        print(
            '\t\t\t\t VLD \t\t\t\t LD \t\t\t\t HD \t\t\t\t VHD \t\t\t\t total \n',
            _kld_D)
        return _kld_P[4]

    def _unite_cl_prob(self):
        #read probabilities from separate files and aggregate it to one file
        [_csr, _y, y_names] = self._read_pickle(self._train_file)
        _train_file, _test_files = self.arff.read_dir(self.prefix +
                                                      'cl_prob_' +
                                                      self.dir_name)
        _prob_list = []
        for _test_file in _test_files:
            with open(_test_file, 'rb') as f:
                _prob = pickle.load(f)
                f.close()
            _prob_list.append(_prob)
        _y1_list = []
        for _test_file1 in self._test_files:
            [_csr1, _y1, _y1_names] = self._read_pickle(_test_file1)
            _y1_list.append(_y1)
        with open('texts/cl_prob_' + self.dir_name + '.pickle', 'wb') as f:
            pickle.dump(
                [_y, _y1_list, _prob_list, self._test_files, _y1_names], f)
            f.close()
        return [_y, _y1_list, _prob_list, self._test_files, _y1_names]

    def _estimate_cl_prob(self):
        try:
            with open('texts/ml_model_' + self.dir_name + '.pickle',
                      'rb') as f:
                self.model = pickle.load(f)
                f.close()
        except:
            [_csr, _y, y_names] = self._read_pickle(self._train_file)
            _prev_train = self.count_prevalence(_y)
            model = self.model  #self.arff.fit(_csr,_y)
            with open('texts/ml_model_' + self.dir_name + '.pickle',
                      'wb') as f:
                pickle.dump(model, f)
                f.close()

        _prob_list = []
        _y1_list = []
        for _t in range(len(self._test_files)):  # range(42,52):
            _test_file = self._test_files[_t]
            [_csr1, _y1, _y1_names] = self._read_pickle(_test_file)
            _y1_list.append(_y1)
            _prob = model.predict_proba(_csr1)
            _prob_list.append(_prob)
            with open(
                    'texts/cl_prob_' +
                    _test_file.rstrip('.arff.pickle').lstrip('texts/pickle_') +
                    '.cl_prob', 'wb') as f:
                pickle.dump(_prob, f)
                f.close()
        with open('texts/cl_prob_' + self.dir_name + '.pickle', 'wb') as f:
            pickle.dump(
                [_y, _y1_list, _prob_list, self._test_files, _y1_names], f)
            f.close()
        return [_y, _y1_list, _prob_list, self._test_files, y_names]

    def _prob_classify_and_count(self, pred_prob):
        #avr_prob=[]
        #for pred_prob in pred_prob_list:
        #    avr_prob=np.concatenate((avr_prob,np.average(pred_prob, axis=0)))
        #print('PCC',avr_prob)
        return np.average(pred_prob, axis=0)

    def _exp_max(self, y_train, pred_prob, stop_delta=0.1):
        """Iteratively re-estimate class prevalence from predicted probabilities.

        Starting from the training prevalence, every round rescales the
        original class probabilities by ``current / training`` prevalence,
        L1-normalises them per sample and takes the per-class mean as the new
        estimate.  The loop stops when the mean absolute change is at most
        ``stop_delta``, when the change grows, or after 100 rounds.

        Parameters:
            y_train: training labels, used for the starting prevalence.
            pred_prob: array with one row per sample, one column per class.
            stop_delta: convergence threshold on the mean absolute change.

        Returns:
            The estimated prevalence per class; the plain per-class mean of
            ``pred_prob`` when an estimate exceeds 0.99.
        """
        train_prev = self._bin_prevalence(y_train)
        estimate = train_prev.copy()
        base_probs = pred_prob.T
        scaled = base_probs.copy()
        change = 1
        previous_change = 1
        rounds = 0
        while rounds < 100 and stop_delta < change <= previous_change:
            # E step: reweight by the ratio of current to training prevalence
            for cls in range(len(train_prev)):
                scaled[cls] = base_probs[cls] * (estimate[cls] / train_prev[cls])
            scaled = normalize(scaled, norm='l1', axis=0)
            # M step: new prevalence is the mean posterior per class
            updated = np.average(scaled, axis=1)
            previous_change = change
            change = self._ae(estimate, updated)
            estimate = updated.copy()
            rounds += 1
        if np.max(estimate) > 0.99:
            # degenerate solution: fall back to the unadjusted mean
            estimate = np.average(base_probs, axis=1)
        return estimate

    def _expectation_maximization(self,
                                  y_train,
                                  pred_prob,
                                  stop_delta=0.1):  #_indexes
        #[y_train, y_test_list, pred_prob_list, test_files, y_names]=_indexes
        #print(pred_prob_list[0][1])
        pr_train = self._bin_prevalence(y_train)
        pr_all = []
        num_iter = []
        test_num = 0  #0..3 len(_y_test_list)
        pr_c = pr_train.copy()

        prob = pred_prob.T
        for cl_n in range(len(pr_train)):  #Category
            #print('Test set N %s, class number %s' %(test_num, cl_n))
            iter = 0
            _delta = 1
            while _delta > stop_delta:
                pr_c_x = []
                _j = 0
                for pr_c_xk in prob[cl_n]:  #xk in category c
                    #Step E
                    pr_c_x_k = (pr_c[cl_n] / pr_train[cl_n] * pr_c_xk) / (
                        ((1 - pr_c[cl_n]) / (1 - pr_train[cl_n])) *
                        (1 - pr_c_xk) + pr_c[cl_n] / pr_train[cl_n] * pr_c_xk)
                    pr_c_x.append(pr_c_x_k)
                    _j += 1
            #Step M
                pr_c_new = np.average(pr_c_x)  #np.average(_prob[cl_n])
                _delta = np.abs(pr_c_new - pr_c[cl_n])
                #print('_delta',_delta)
                #pr_train[cl_n]=pr_c[cl_n]
                #prob[cl_n]=pr_c_x_k
                pr_c[cl_n] = pr_c_new
                iter += 1
            num_iter.append(iter)
            if np.max([pr_c[cl_n], 1 - pr_c[cl_n]]) > 0.99:
                pr_c[cl_n] = np.average(prob[cl_n])
        return pr_c  #,num_iter

    def _cost_sens_learning(self,
                            X_test,
                            stop_delta=0.00001,
                            class_weight_start='auto'):
        """Iteratively refit a class-weighted model to estimate prevalence.

        A first model built with ``class_weight_start`` is fitted on
        ``self.X_train`` / ``self.y_train`` and its mean predicted
        probability on ``X_test`` becomes the current estimate. Up to ten
        times, a new model is fitted with class weights equal to the current
        estimate divided by ``self._classify_and_count(self.y_train)``.

        Iteration stops early when the change between consecutive estimates
        (``self._ae``) drops below ``stop_delta``, or when the step-to-step
        variation of that change has grown twice in a row. The last fitted
        model is stored in ``self.iter_model``.

        Returns the current estimate; on an early stop this is the estimate
        that was used to weight the final model, not that model's own output.
        """
        train_prev = self._classify_and_count(self.y_train)
        model = self.__classificator(class_weight=class_weight_start)
        model.fit(self.X_train, self.y_train)
        current_prev = np.average(model.predict_proba(X_test), axis=0)

        last_delta = 0
        older_var = 0   # variation two passes ago
        recent_var = 0  # variation one pass ago
        for _ in range(10):
            weights = dict(zip(self.classes, current_prev / train_prev))
            model = self.__classificator(class_weight=weights)
            model.fit(self.X_train, self.y_train)
            next_prev = np.average(model.predict_proba(X_test), axis=0)

            new_delta = self._ae(current_prev, next_prev)
            variation = abs(new_delta - last_delta)
            last_delta = new_delta
            if new_delta < stop_delta or (variation > recent_var and
                                          recent_var > older_var and
                                          older_var != 0):
                break
            older_var, recent_var = recent_var, variation
            current_prev = next_prev.copy()

        self.iter_model = model
        return current_prev

    def __conditional_probability(self, p1, p2, val1, val2):
        c = 0
        for _i in range(len(p1)):
            if p1[_i] == val1 and p2[_i] == val2:
                c = c + 1
        return c / len(p1)

    def __kfold_tp_fp(self, X, y, n_folds=2):
        #return true positive rate and false positive rate arrays

        #if isinstance(X, csr_matrix) and isinstance(y, csr_matrix):
        #    X=X.toarray()
        #    y=y.toarray()
        #elif isinstance(X, csr_matrix) and isinstance(y, np.ndarray):
        #    X=X.toarray()
        #    y=MultiLabelBinarizer(classes=self.classes).fit_transform([[y_p] for y_p in y])
        #elif isinstance(X, np.ndarray) and isinstance(y, np.ndarray):
        #    if len(y.shape)==1:
        #        y=MultiLabelBinarizer(classes=self.classes).fit_transform([[y_p] for y_p in y])
        #    elif len(y.shape)==2:
        #        pass
        if isinstance(y, list):
            y = np.asarray(y)

        try:
            with open(
                    self.prefix + self.dir_name + '/' + str(n_folds) +
                    'FCV.pickle', 'rb') as f:
                [tp_av, fp_av] = pickle.load(f)
        except:
            _kf = KFold(y.shape[0], n_folds=n_folds)
            tp = []
            fp = []
            for train_index, test_index in _kf:
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                model = self.model
                model = model.fit(X_train,
                                  y_train)  #arff.fit(X_train, y_train)
                y_predict = model.predict(X_test)
                tp_k = []
                fp_k = []
                if len(y.shape) == 1:
                    y_test = MultiLabelBinarizer(
                        classes=self.classes).fit_transform([[y_p]
                                                             for y_p in y_test
                                                             ])
                    y_predict = MultiLabelBinarizer(
                        classes=self.classes).fit_transform(
                            [[y_p] for y_p in y_predict])
                elif len(y.shape) == 2:
                    pass
                for s_true, s_pred in zip(y_test.T, y_predict.T):
                    tp_k.append(
                        self.__conditional_probability(
                            s_pred, s_true, 1., 1.))  #cm[0,0]/len(s_true))
                    fp_k.append(
                        self.__conditional_probability(
                            s_pred, s_true, 1.,
                            0.))  #cm[1,0]/len(s_true))#len(s_true))
                tp.append(tp_k)
                fp.append(fp_k)
            tp_av = np.asarray([np.average(tp_k) for tp_k in np.asarray(tp).T])
            fp_av = np.asarray([np.average(fp_k) for fp_k in np.asarray(fp).T])
            with open(
                    self.prefix + self.dir_name + '/' + str(n_folds) +
                    'FCV.pickle', 'wb') as f:
                pickle.dump([tp_av, fp_av], f)
                f.close()
            #print('[tp_av, fp_av] by index',tp_av, fp_av)
        return [tp_av, fp_av]

    def __kfold_prob_tp_fp(self, X, y, n_folds=2):
        # if isinstance(X, csr_matrix) and isinstance(y, np.ndarray):
        #     X=X.toarray()
        # elif isinstance(X, np.ndarray) and isinstance(y, np.ndarray):
        #     if len(y.shape)==1:
        #         y=MultiLabelBinarizer(classes=self.classes).fit_transform([[y_p] for y_p in y])
        #     elif len(y.shape)==2:
        #         pass
        if isinstance(y, list):
            y = np.asarray(y)

        try:
            with open(
                    self.prefix + self.dir_name + '/' + str(n_folds) +
                    'FCV_prob.pickle', 'rb') as f:
                [tp_av, fp_av] = pickle.load(f)
        except:
            kf = KFold(y.shape[0], n_folds=n_folds)
            TP_avr = []
            FP_avr = []
            for train_index, test_index in kf:
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                model = self.model
                model = model.fit(X_train, y_train)
                y_predict = model.predict(X_test)
                y_prob_predict = model.predict_proba(X_test)
                TP = []
                FP = []
                if len(y.shape) == 1:
                    y_predict = MultiLabelBinarizer(
                        classes=self.classes).fit_transform(
                            [[y_p] for y_p in y_predict])
                elif len(y.shape) == 2:
                    pass
                for class_ind, class_prob in zip(y_predict.transpose(),
                                                 y_prob_predict.transpose()):
                    TP_class = []
                    FP_class = []
                    for ind, prob in zip(class_ind, class_prob):
                        if ind == 1: TP_class.append(prob)
                        elif ind == 0: FP_class.append(prob)
                    TP.append(np.sum(TP_class) / len(class_ind))
                    FP.append(np.sum(FP_class) / len(class_ind))
                TP_avr.append(TP)
                FP_avr.append(FP)
            tp_av, fp_av = np.average(TP_avr, axis=0), np.average(FP_avr,
                                                                  axis=0)
            with open(
                    self.prefix + self.dir_name + '/' + str(n_folds) +
                    'FCV_prob.pickle', 'wb') as f:
                pickle.dump([tp_av, fp_av], f)
                f.close()
            #print('tp, fp by prob', tp_av, fp_av)
        return [tp_av, fp_av]

    def _adj_classify_and_count(self, y_pred, is_prob=False):
        [tp_av, fp_av] = self.kfold_results

        if is_prob:
            pr = np.average(y_pred, axis=0)
        else:
            pr = self.method_prev(y_pred)
        try:
            pred = (pr - fp_av) / (tp_av - fp_av)
            if np.min(pred) >= 0:
                pred = normalize(pred, norm='l1', axis=1)[0]
            else:
                #print(pred)
                #print(pr,tp_av,fp_av)
                pred = pr
        except:
            print(pr, tp_av, fp_av)
            pred = pr
        return pred

    def _process_pipeline(self):
        """Run the quantification steps in sequence on pickled data.

        Loads the training pickle named by ``self._train_file``, calls
        ``self.fit`` on it, then reads the stored class indexes and class
        probabilities for ``self.dir_name`` and passes them to the estimators
        called below.

        NOTE(review): several calls in this method do not match the method
        signatures visible in this class; see the inline notes.
        """
        # Warning! Processing can take a long time. We recommend running it step by step.
        # Example usage kept from the original author:
        #pa=Parse_ARFF()
        #pa.convert_arff(QuantOHSUMED, is_predict=False)
        #q=Quantification('QuantOHSUMED')
        #q.process_pipeline()
        #####################################################
        [self.X_train, self.y_train,
         y_names] = self._read_pickle(self._train_file)
        self.fit(self.X_train, self.y_train)
        # Class indexes are read from an existing pickle rather than recomputed:
        #[y_train, y_test_list, y_pred_list, test_files, y_names]=self._estimate_cl_indexes()
        [y_train, y_test_list, y_pred_list, test_files, y_names
         ] = self._read_pickle('texts/cl_indexes_' + self.dir_name + '.pickle')
        # td: count on the test labels; ed1: count on the predicted labels
        td = self._classify_and_count(y_test_list)
        ed1 = self._classify_and_count(y_pred_list)

        # NOTE(review): _adj_classify_and_count is defined in this class as
        # (self, y_pred, is_prob=False); three positional arguments are passed
        # here, which would raise TypeError — confirm the intended call.
        ed2 = self._adj_classify_and_count(self.X_train, self.y_train,
                                           y_pred_list)

        self._estimate_cl_prob()
        self._unite_cl_prob()
        [y_train, y_test_list, pred_prob_list, test_files, y_names
         ] = self._read_pickle('texts/cl_prob_' + self.dir_name + '.pickle')
        # NOTE(review): the signature of _classify_and_count is not visible
        # here; verify that it accepts is_prob.
        ed3 = self._classify_and_count(pred_prob_list, is_prob=True)
        ed4 = self._prob_classify_and_count(pred_prob_list)
        # NOTE(review): _expectation_maximization returns a single prevalence
        # array, so this two-name unpacking only succeeds when that array has
        # exactly two entries, and num_iter is then not an iteration count.
        ed5, num_iter = self._expectation_maximization(self.y_train,
                                                       pred_prob_list, 0.1)
        self._count_diff(td, ed4)
        self._count_diff1(td, ed5, num_iter)