Example #1
    def to_path(self,
                new_path,
                flatten=False,
                add_dir="3-layer",
                pop=0,
                n_jobs=1):
        """

        Parameters
        ----------
        new_path: str
            The destination path.
        flatten: bool, dict
            Flatten the filtered files into the destination.
            If flatten is a dict, each key is a specific directory name and
            each value is True.
            Examples:
            flatten = {"asp": True}
        add_dir: list, int
            Prepend the top directory name to each file name to avoid
            clashes between files with the same name.
            Only valid for flatten=True.
        pop: int (negative)
            Drop the last n path layers. Default 0.
            Used to copy by directory rather than by file; only used for
            flatten=False.
        n_jobs: int
            Number of parallel jobs.

        Returns
        -------
            Files copied into the new path.
        """
        self.file_list_merge = self.merge(pop=pop)
        new_path = def_pwd(new_path)
        self.file_list_merge_new = self.merge(path=new_path,
                                              flatten=flatten,
                                              add_dir=add_dir,
                                              refresh_file_list=False,
                                              pop=pop)
        if len(set(self.file_list_merge_new)) < len(set(self.file_list_merge)):
            raise UserWarning(
                "There are files with the same name after flattening folders. "
                "You can change add_dir to add a distinguishing prefix to the files.")
        if n_jobs != 1:
            parallelize(n_jobs,
                        self.copy_user,
                        zip(
                            self.file_list_merge,
                            self.file_list_merge_new,
                        ),
                        mode="j",
                        respective=False)
        else:
            for ij in tqdm(
                    list(zip(self.file_list_merge, self.file_list_merge_new))):
                self.copy_user(ij)
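A hedged usage sketch of the method above (the receiver name ff, the destination path, and the data are hypothetical; the flatten dict form follows the docstring example):

    # copy all filtered files into ./collected, flattening only the "asp"
    # directories and prefixing file names with their top directory name
    ff.to_path("./collected", flatten={"asp": True}, add_dir="3-layer", n_jobs=4)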
Example #2
    def _transform(self, structures: List[Structure], **kwargs):
        """

        Args:
            structures: (list) Preprocessed samples to transform to graphs.

            **kwargs: Extra per-sample keyword arguments; None and scalar
                values are broadcast to the length of ``structures``.

        Returns:
            list of graphs:
                List of dict.

        """
        assert isinstance(structures, Iterable)
        if hasattr(structures, "__len__"):
            assert len(structures) > 0, "Empty input data!"

        le = len(structures)

        # broadcast None and scalar keyword arguments: one value per structure
        for k in kwargs:
            if kwargs[k] is None or not isinstance(kwargs[k], Iterable):
                kwargs[k] = [kwargs[k]] * le

        kw = [{k: v[i] for k, v in kwargs.items()} for i in range(le)]

        iterables = zip(structures, kw)

        if not self.batch_calculate:
            rets = parallelize(self.n_jobs,
                               self._wrapper,
                               iterables,
                               tq=True,
                               respective=True,
                               respective_kwargs=True)
            ret, self.support_ = zip(*rets)

        else:
            rets = batch_parallelize(self.n_jobs,
                                     self._wrapper,
                                     iterables,
                                     respective=True,
                                     respective_kwargs=True,
                                     tq=True,
                                     mode="j",
                                     batch_size=self.batch_size)

            ret, self.support_ = zip(*rets)

        if self.add_label:
            # store a doubled index label; the duplicate is consumed later
            for n, i in enumerate(ret):
                i.update({"label": torch.tensor([n, n])})
        return ret
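The kwargs loop above broadcasts None and scalar keyword arguments so that each structure gets its own kwargs dict. A minimal standalone illustration of that logic (the keys are hypothetical):

    from collections.abc import Iterable

    kwargs = {"state": 3, "weight": [0.1, 0.2]}  # "state" is a scalar
    le = 2
    for k in kwargs:
        if kwargs[k] is None or not isinstance(kwargs[k], Iterable):
            kwargs[k] = [kwargs[k]] * le
    kw = [{k: v[i] for k, v in kwargs.items()} for i in range(le)]
    # kw == [{'state': 3, 'weight': 0.1}, {'state': 3, 'weight': 0.2}]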
Example #3
    def _fit(self, x, y, searchspace0, regclf0):
        def fit_parallelize(random_state):
            data_train, y_train = sklearn.utils.resample(
                x, y, n_samples=None, replace=True, random_state=random_state)
            regclf0.fit(data_train, y_train)
            predict_data = regclf0.predict(searchspace0)
            predict_data = predict_data.ravel()  # ravel() is not in-place; keep its result
            return predict_data

        predict_dataj = parallelize(n_jobs=self.n_jobs,
                                    func=fit_parallelize,
                                    iterable=range(self.number))

        return np.array(predict_dataj).T
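The returned array has shape (n_sample_pre, self.number): one column of search-space predictions per bootstrap resample. A typical follow-up under that assumption (a sketch, not part of the original class):

    predict_y = self._fit(x, y, searchspace0, regclf0)
    mean = predict_y.mean(axis=1)  # expected prediction per candidate
    std = predict_y.std(axis=1)    # bootstrap uncertainty per candidate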
Example #4
    def fit(self, searchspace=None, X=None, y=None, *args):
        """

        Parameters
        ----------
        searchspace: np.ndarray of shape (n_sample_pre, n_feature)
            Search space with the same ``n_feature`` as X.
            Custom, or generated by the .search_space() function.
        X: np.ndarray of shape (n_sample_train, n_feature)
            X data (2D).
        y: np.ndarray of shape (n_sample_train, 1)
            y data (1D).

        """
        assert hasattr(self.regclf, "fit")
        assert hasattr(self.regclf, "predict")

        self.searchspace = self.searchspace if searchspace is None else searchspace
        self.X = self.X if X is None else X
        self.y = self.y if y is None else y
        searchspace = self.searchspace
        X = self.X
        y = self.y

        njobs = self.n_jobs
        regclf0 = self.regclf
        assert searchspace is not None and X is not None and y is not None, "searchspace, X, y should be np.array"
        check_array(X, ensure_2d=True, force_all_finite=True)
        check_array(y, ensure_2d=False, force_all_finite=True)
        check_array(searchspace, ensure_2d=True, force_all_finite=True)
        assert X.shape[1] == searchspace.shape[1]

        def fit_parallelize(random_state):
            data_train, y_train = sklearn.utils.resample(X, y, n_samples=None, replace=True,
                                                         random_state=random_state)
            regclf0.fit(data_train, y_train)
            predict_data = regclf0.predict(searchspace)
            predict_data = predict_data.ravel()  # ravel() is not in-place; keep its result
            return predict_data

        predict_y = parallelize(n_jobs=njobs, func=fit_parallelize, iterable=range(self.number))
        predict_y = np.array(predict_y).T

        self.predict_y = predict_y

        self.meanandstd()
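A hedged usage sketch (the names model, grid, X_train, and y_train are hypothetical; meanandstd is called internally at the end of fit, as above):

    model.fit(searchspace=grid, X=X_train, y=y_train)
    # model.predict_y now holds the (n_sample_pre, number) matrix of
    # bootstrap predictions over the search space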
Example #5
    def _transform(self,
                   structures: List[Structure],
                   state_attributes: List = None):
        """

        Parameters
        ----------
        structures: list
            Preprocessed samples to transform to graphs.
        state_attributes: List
            Preprocessed state attributes to add to each graph.

        Returns
        -------
        list of graphs:
            List of dict.

        """

        if state_attributes is None:
            state_attributes = [None] * len(structures)
        assert isinstance(structures, Iterable)
        if hasattr(structures, "__len__"):
            assert len(structures) > 0, "Empty input data!"
        iterables = zip(structures, state_attributes)

        if not self.batch_calculate:
            rets = parallelize(self.n_jobs,
                               self._wrapper,
                               iterables,
                               tq=True,
                               respective=True)

            ret, self.support_ = zip(*rets)

        else:
            rets = batch_parallelize(self.n_jobs,
                                     self._wrapper,
                                     iterables,
                                     respective=True,
                                     tq=True,
                                     batch_size=self.batch_size)

            ret, self.support_ = zip(*rets)
        return ret
Example #6
    def fit(self, X, y, groups=None):
        """Fit the baf model and automatically tune the number of selected feature.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_feature]
            Training vector, where `n_samples` is the number of samples and
            `n_feature` is the total number of features.

        y : array-like, shape = [n_samples]
            Target values (integers for classification, real numbers for
            regression).

        groups : array-like, shape = [n_samples], optional
            Group labels for the samples used while splitting the dataset into
            train/test sets.
        """
        X, y = check_X_y(X, y, "csr")
        # Initialization
        estimator = clone(self.estimator)
        scorer = check_scoring(estimator, scoring=self.scoring)
        ran = check_random_state(self.random_state)

        baf = BackForward(
            estimator=estimator,
            n_type_feature_to_select=self.n_type_feature_to_select,
            verbose=self.verbose,
            primary_feature=self.primary_feature,
            muti_grade=self.muti_grade,
            muti_index=self.muti_index,
            must_index=self.must_index,
            random_state=ran)
        rans = ran.randint(0, 1000, self.times)

        func = partial(_multi_time_fit, baf=baf, X=X, y=y, scorer=scorer)

        scores = parallelize(n_jobs=self.n_jobs,
                             func=func,
                             iterable=rans,
                             respective=False)

        support, scores, score_step = zip(*scores)
        best_support = support[np.argmax(scores)]
        best_score = max(scores)
        # Re-execute an elimination with best_k over the whole set

        # Set final attributes
        self.support_step = score_step
        self.support_ = best_support
        self.score_ = best_score
        self.estimator_ = clone(self.estimator)
        if self.refit:
            if not hasattr(self.estimator_, 'best_score_'):
                warnings.warn(
                    UserWarning(
                        "The self.estimator_: {} used all the X, y data. "
                        "Please be careful with the later 'score' and 'predict'.".format(
                            self.estimator_.__class__.__name__)))
            if hasattr(self.estimator_, 'best_score_') and hasattr(self.estimator_, "refit") \
                    and self.estimator_.refit is True:
                warnings.warn(
                    UserWarning(
                        "The self.estimator_: {} used all the X, y data. "
                        "Please be careful with the later 'score' and 'predict'.".format(
                            self.estimator_.__class__.__name__)))
            self.estimator_.fit(X[:, self.support_], y)
        self.n_feature_ = np.count_nonzero(best_support)  # size of the winning support
        return self
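From the triple unpacking zip(*scores) above, each _multi_time_fit call must return a (support, score, score_step) tuple. A minimal sketch of a compatible worker (the body is an assumption for illustration; the real function is defined elsewhere in the library):

    def _multi_time_fit(random_state, baf, X, y, scorer):
        # refit the backward/forward selector with a fresh random state
        # (attribute names baf.support_, baf.estimator_, baf.score_ are assumed)
        baf.random_state = random_state
        baf.fit(X, y)
        score = scorer(baf.estimator_, X[:, baf.support_], y)
        return baf.support_, score, baf.score_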
Example #7
def acfng(expr01,
          x,
          y,
          init_c=None,
          terminals=None,
          c_terminals=None,
          np_maps=None,
          classification=False,
          no_gradient_coef=-1,
          no_gradient_coef_range=np.arange(-1, 1, 1),
          n_jobs=1,
          scoring="r2"):
    """
    Add coefficients, including one no-gradient coefficient.

    Try to calculate the predicted y from the sympy expression with
    coefficients; if an error is raised, return the expression itself.


    Parameters
    ----------
    scoring: str
        Score name from sklearn.metrics.
    n_jobs: int
        Number of parallel jobs.
    no_gradient_coef: int, sympy.Symbol
        The coefficient inside the no-gradient function; defaults to the last one.
        Examples:
        no_gradient_coef=sympy.Symbol("c2")
        no_gradient_coef=0
    no_gradient_coef_range:
        Range of values to try for this special coefficient.
    expr01: sympy.Expr
        Expression to fit.
    x: list of np.ndarray or np.ndarray
        Real data: [x1, x2, x3, ..., x_n_feature].
    y: np.ndarray with shape (n_sample,)
        Real target data.
    init_c: list of float, float, or None
        Default 1.
    terminals: List of sympy.Symbol, None
        Placeholders for xi, matching the features in expr01.
    c_terminals: List of sympy.Symbol, None
        Placeholders for ci, matching the coefficients/constants in expr01.
    np_maps: dict, default None
        For user-defined functions.
        1. Build your function with sympy.Function and arrange it in expr01.
        >>> x1, x2, x3, c1,c2,c3,c4 = sympy.symbols("x1,x2,x3,c1,c2,c3,c4")
        >>> Seg = sympy.Function("Seg")
        >>> expr01 = Seg(x1*x2)
        2. Write the numpy calculation for this function.
        >>> def np_seg(x):
        >>>     res = x
        >>>     res[res>1]=-res[res>1]
        >>>     return res
        3. Pass it via the np_maps parameter.
        >>> np_maps = {"Seg":np_seg}

        In total, when parsing expr01, the numpy implementation of each
        function is looked up in order: (np_maps -> numpy's functions -> system -> Error).

    classification: bool
        Classification or not; default False.

    Returns
    -------
    pre_y:
        np.array or None
    expr01: Expr
        New expr.
    """
    expr01, x, y, init_c, terminals, c_terminals, np_maps = format_input(
        expr01, x, y, init_c, terminals, c_terminals, np_maps)
    if isinstance(no_gradient_coef, int):
        no_gradient_coef = c_terminals[no_gradient_coef]

    exprs = [
        expr01.xreplace({no_gradient_coef: i}) for i in no_gradient_coef_range
    ]

    def func(expr):
        return acfs(expr,
                    x,
                    y,
                    init_c,
                    terminals,
                    c_terminals,
                    np_maps,
                    classification=classification,
                    built_format_input=False)

    scores = parallelize(n_jobs, func=func, iterable=exprs)

    maxp = False if "neg" in scoring else True

    scores = np.array(scores)
    scores_error = ~np.isfinite(scores)
    scores[scores_error] = -np.inf if maxp else np.inf
    index = np.argmax(scores) if maxp else np.argmin(scores)
    score = scores[index]
    print(score)
    # exprs is indexed by position; the winning coefficient value itself
    # is list(no_gradient_coef_range)[index]
    return acf(exprs[index],
               x,
               y,
               init_c,
               terminals,
               c_terminals,
               np_maps,
               classification=classification,
               built_format_input=False)
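Putting the docstring pieces together, a hedged end-to-end call might look like this (toy random data; Seg and np_seg as defined in the docstring, and the return unpacking follows the documented (pre_y, expr01) pair):

    import numpy as np
    import sympy

    x1, x2, c1 = sympy.symbols("x1,x2,c1")
    Seg = sympy.Function("Seg")
    expr01 = c1 * Seg(x1 * x2)

    def np_seg(x):
        # flip values above 1 to their negative, as in the docstring example
        res = x
        res[res > 1] = -res[res > 1]
        return res

    x = [np.random.rand(100), np.random.rand(100)]
    y = np.random.rand(100)
    pre_y, expr01 = acfng(expr01, x, y, np_maps={"Seg": np_seg}, n_jobs=1)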
Example #8
    def _fit(self, x, y):

        estimator = clone(self.estimator)

        def score_pri(slices, x0, y0):
            slices = list(slices)
            if len(slices) < 1:
                score0 = -np.inf
            else:
                slices = self.feature_unfold(slices)
                data_x0 = x0[:, slices]

                if hasattr(estimator, "best_score_"):
                    estimator.fit(data_x0, y0)
                    score0 = np.mean(estimator.best_score_)  # score_test

                else:
                    score0 = cross_val_score(estimator,
                                             data_x0,
                                             y0,
                                             cv=self.cv)
                    score0 = np.mean(score0)
                # print(slices, score0)
            return score0

        score = partial(score_pri, x0=x, y0=y)

        self.score_ = []
        x, y = check_X_y(x, y, "csc")
        assert all((self.check_must, self.check_muti)) in [True, False]

        feature_list = list(range(x.shape[1]))
        fold_feature_list = self.feature_fold(feature_list)
        if self.check_must:
            fold_feature_list = [
                i for i in fold_feature_list if i not in self.check_must
            ]

        slice_all = [combinations(fold_feature_list, i) for i in self.n_select]
        slice_all = [
            list(self.feature_must_fold(_)) for i in slice_all for _ in i
        ]

        scores = parallelize(n_jobs=self.n_jobs,
                             func=score,
                             iterable=slice_all)

        feature_combination = [self.feature_unfold(_) for _ in slice_all]
        index = np.argmax(scores)
        select_feature = feature_combination[int(index)]
        su = np.zeros(x.shape[1], dtype=bool)  # np.bool is removed in modern NumPy
        su[select_feature] = 1
        self.best_score_ = max(scores)
        self.score_ = scores
        self.support_ = su
        self.estimator_ = clone(self.estimator)
        if self.refit:
            if not hasattr(self.estimator_, 'best_score_'):
                warnings.warn(
                    UserWarning(
                        "The self.estimator_: {} used all the X, y data. "
                        "Please be careful with the later 'score' and 'predict'.".format(
                            self.estimator_.__class__.__name__)))
            if hasattr(self.estimator_, 'best_score_') and hasattr(self.estimator_, "refit") \
                    and self.estimator_.refit is True:
                warnings.warn(
                    UserWarning(
                        "The self.estimator_: {} used all the X, y data. "
                        "Please be careful with the later 'score' and 'predict'.".format(
                            self.estimator_.__class__.__name__)))
            self.estimator_.fit(x[:, select_feature], y)
        self.n_feature_ = len(select_feature)
        self.score_ex = list(zip(feature_combination, scores))
        self.scatter = list(zip([len(i) for i in slice_all], scores))
        self.score_ex.sort(key=lambda _: _[1], reverse=True)

        return self
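Note the cost of this exhaustive search: the number of candidate subsets, and thus model fits, is the sum of C(len(fold_feature_list), k) over k in n_select. A quick sanity check (a sketch with assumed sizes):

    from math import comb

    n_fits = sum(comb(20, k) for k in (2, 3))  # 20 folded features -> 1330 fits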
Example #9
def func(n, _=None):
    # time.sleep(0.0001)
    s = np.random.random((10, 50))
    return s


if __name__ == "__main__":

    iterable = np.arange(50)
    s0 = batch_parallelize(4,
                           func,
                           iterable,
                           respective=False,
                           tq=True,
                           mode="m")  #无tq
    s1 = parallelize(4, func, iterable, respective=False, tq=True, mode="j")
    s2 = parallelize_imap(4, func, iterable, tq=True)
    s0 = parallelize(4, func, iterable, respective=False, tq=False,
                     mode="m")  #无tq
    s1 = parallelize(4, func, iterable, respective=False, tq=False, mode="j")
    s2 = parallelize_imap(4, func, iterable, tq=False)

    def func(n, _=None):
        # time.sleep(0.0001)
        s = np.random.random((100, 50))
        return s

    iterable = np.arange(10000)

    print("samll loop and big data")
Example #10
def multiEaSimple(population, toolbox, cxpb, mutpb, ngen, stats=None,
                  halloffame=None, verbose=__debug__, pset=None, store=True, alpha=1):
    """

    Parameters
    ----------
    population
    toolbox
    cxpb
    mutpb
    ngen
    stats
    halloffame
    verbose
    pset
    store
    alpha

    Returns
    -------

    """
    logbook = Logbook()
    logbook.header = ['gen', 'nevals'] + (stats.fields if stats else [])

    # Evaluate the individuals with an invalid fitness
    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    random_seed = random.randint(1, 1000)
    # fitnesses = list(toolbox.map(toolbox.evaluate, [str(_) for _ in invalid_ind]))
    # fitnesses2 = toolbox.map(toolbox.evaluate2, [str(_) for _ in invalid_ind])
    fitnesses = parallelize(n_jobs=6, func=toolbox.evaluate, iterable=[str(_) for _ in invalid_ind])
    fitnesses2 = parallelize(n_jobs=6, func=toolbox.evaluate2, iterable=[str(_) for _ in invalid_ind])

    def funcc(a, b):
        """Blend the two fitness scores: (alpha * a + b) / 2."""
        return (alpha * a + b) / 2

    for ind, fit, fit2 in zip(invalid_ind, fitnesses, fitnesses2):
        ind.fitness.values = funcc(fit[0], fit2[0]),
        ind.values = (fit[0], fit2[0])
        ind.expr = (fit[1], fit2[1])
    if halloffame is not None:
        halloffame.update(population)
    random.seed(random_seed)
    record = stats.compile_(population) if stats else {}
    logbook.record(gen=0, nevals=len(invalid_ind), **record)
    if verbose:
        print(logbook.stream)
    data_all = {}
    # Begin the generational process
    for gen in range(1, ngen + 1):
        # Select the next generation of individuals with select_gs
        offspring = toolbox.select_gs(population, len(population))
        # Vary the pool of individuals
        offspring = varAnd(offspring, toolbox, cxpb, mutpb)
        if halloffame is not None:
            offspring.extend(halloffame.items[-2:])

        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        random_seed = random.randint(1, 1000)
        # fitnesses = toolbox.map(toolbox.evaluate, [str(_) for _ in invalid_ind])
        # fitnesses2 = toolbox.map(toolbox.evaluate2, [str(_) for _ in invalid_ind])
        fitnesses = parallelize(n_jobs=6, func=toolbox.evaluate, iterable=[str(_) for _ in invalid_ind])
        fitnesses2 = parallelize(n_jobs=6, func=toolbox.evaluate2, iterable=[str(_) for _ in invalid_ind])

        for ind, fit, fit2 in zip(invalid_ind, fitnesses, fitnesses2):
            ind.fitness.values = funcc(fit[0], fit2[0]),
            ind.values = (fit[0], fit2[0])
            ind.expr = (fit[1], fit2[1])

        # Update the hall of fame with the generated individuals
        if halloffame is not None:
            halloffame.update(offspring)
            if halloffame.items[-1].fitness.values[0] >= 0.95:
                print(halloffame.items[-1])
                print(halloffame.items[-1].fitness.values[0])
                print(halloffame.items[-1].values[0])
                print(halloffame.items[-1].values[1])
                break

        if store:
            if pset:
                subp = partial(sub, subed=pset.rep_name_list, subs=pset.name_list)
                data = [{"score": i.values[0], "expr": subp(i.expr[0])} for i in halloffame.items[-2:]]
                data2 = [{"score": i.values[1], "expr": subp(i.expr[1])} for i in halloffame.items[-2:]]
            else:
                data = [{"score": i.values[0], "expr": i.expr[0]} for i in halloffame.items[-2:]]
                data2 = [{"score": i.values[1], "expr": i.expr[1]} for i in halloffame.items[-2:]]
            data_all['gen%s' % gen] = list(zip(data, data2))
        random.seed(random_seed)
        # Replace the current population by the offspring
        population[:] = offspring
        # Append the current generation statistics to the logbook
        record = stats.compile_(population) if stats else {}
        logbook.record(gen=gen, nevals=len(invalid_ind), **record)
        if verbose:
            print(logbook.stream)
    if store:
        store1 = Store()
        store1.to_txt(data_all)

    return population, logbook
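With the default alpha=1, funcc is simply the arithmetic mean of the two fitness components; larger alpha weights the first objective more heavily. A quick illustration:

    funcc = lambda a, b, alpha=1: (alpha * a + b) / 2
    funcc(0.9, 0.7)            # 0.8, plain average
    funcc(0.9, 0.7, alpha=2)   # 1.25, first score dominates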