Ejemplo n.º 1
0
def main(opt, flg):
    # import functions which depend on sklearn only after parser run
    from ml_functions import (balance, explorer_clsfiers, run_classifier,
                          optimize_training, explore_SVC, plot_grid)
    from features import importances, tocsv

    msgr = get_msgr()
    indexes = None
    vect = opt['vector']
    vtraining = opt['vtraining'] if opt['vtraining'] else None
    scaler, decmp = None, None
    vlayer = opt['vlayer'] if opt['vlayer'] else vect + '_stats'
    tlayer = opt['tlayer'] if opt['tlayer'] else vect + '_training'
    rlayer = opt['rlayer'] if opt['rlayer'] else vect + '_results'

    labels = extract_classes(vtraining, 1)
    pprint(labels)

    if opt['scalar']:
        scapar = opt['scalar'].split(',')
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler(with_mean='with_mean' in scapar,
                                with_std='with_std' in scapar)

    if opt['decomposition']:
        dec, params = (opt['decomposition'].split('|')
                       if '|' in opt['decomposition']
                       else (opt['decomposition'], ''))
        kwargs = ({k: v for k, v in (p.split('=') for p in params.split(','))}
                  if params else {})
        load_decompositions()
        decmp = DECMP[dec](**kwargs)

    # if training extract training
    if vtraining and flg['e']:
        msgr.message("Extract training from: <%s> to <%s>." % (vtraining, vect))
        extract_training(vect, vtraining, tlayer)
        flg['n'] = True

    if flg['n']:
        msgr.message("Save arrays to npy files.")
        save2npy(vect, vlayer, tlayer,
                 fcats=opt['npy_cats'], fcols=opt['npy_cols'],
                 fdata=opt['npy_data'], findx=opt['npy_index'],
                 fclss=opt['npy_tclasses'], ftdata=opt['npy_tdata'])

    # define the classifiers to use/test
    if opt['pyclassifiers'] and opt['pyvar']:
        # import classifiers to use
        mycls = imp.load_source("mycls", opt['pyclassifiers'])
        classifiers = getattr(mycls, opt['pyvar'])
    else:
        from ml_classifiers import CLASSIFIERS
        classifiers = CLASSIFIERS

    # Append the SVC classifier
    if opt['svc_c'] and opt['svc_gamma']:
            from sklearn.svm import SVC
            svc = {'name': 'SVC', 'classifier': SVC,
                   'kwargs': {'C': float(opt['svc_c']),
                              'gamma': float(opt['svc_gamma']),
                              'kernel': opt['svc_kernel']}}
            classifiers.append(svc)

    # extract classifiers from pyindx
    if opt['pyindx']:
        indexes = [i for i in get_indexes(opt['pyindx'])]
        classifiers = [classifiers[i] for i in indexes]

    num = int(opt['n_training']) if opt['n_training'] else None

    # load fron npy files
    Xt = np.load(opt['npy_tdata'])
    Yt = np.load(opt['npy_tclasses'])
    cols = np.load(opt['npy_cols'])

    # Define rules to substitute NaN, Inf, posInf, negInf values
    rules = {}
    for key in ('nan', 'inf', 'neginf', 'posinf'):
        if opt[key]:
            rules[key] = get_rules(opt[key])
    pprint(rules)

    # Substitute (skip cat column)
    Xt, rules_vals = substitute(Xt, rules, cols[1:])
    Xtoriginal = Xt

    # scale the data
    if scaler:
        msgr.message("Scaling the training data set.")
        scaler.fit(Xt, Yt)
        Xt = scaler.transform(Xt)

    # decompose data
    if decmp:
        msgr.message("Decomposing the training data set.")
        decmp.fit(Xt)
        Xt = decmp.transform(Xt)

    # Feature importances with forests of trees
    if flg['f']:
        np.save('training_transformed.npy', Xt)
        importances(Xt, Yt, cols[1:],
                    csv=opt['imp_csv'], img=opt['imp_fig'],
                    # default parameters to save the matplotlib figure
                    **dict(dpi=300, transparent=False, bbox_inches='tight'))

    # optimize the training set
    if flg['o']:
        ind_optimize = (int(opt['pyindx_optimize']) if opt['pyindx_optimize']
                        else 0)
        cls = classifiers[ind_optimize]
        msgr.message("Find the optimum training set.")
        best, Xbt, Ybt = optimize_training(cls, Xt, Yt,
                                           labels, #{v: k for k, v in labels.items()},
                                           scaler, decmp,
                                           num=num, maxiterations=1000)
        msg = "    - save the optimum training data set to: %s."
        msgr.message(msg % opt['npy_btdata'])
        np.save(opt['npy_btdata'], Xbt)
        msg = "    - save the optimum training classes set to: %s."
        msgr.message(msg % opt['npy_btclasses'])
        np.save(opt['npy_btclasses'], Ybt)

    # balance the data
    if flg['b']:
        msg = "Balancing the training data set, each class have <%d> samples."
        msgr.message(msg % num)
        Xbt, Ybt = balance(Xt, Yt, num)
    else:
        if not flg['o']:
            Xbt = (np.load(opt['npy_btdata'])
                   if os.path.isfile(opt['npy_btdata']) else Xt)
            Ybt = (np.load(opt['npy_btclasses'])
                   if os.path.isfile(opt['npy_btclasses']) else Yt)

    # scale the data
    if scaler:
        msgr.message("Scaling the training data set.")
        scaler.fit(Xbt, Ybt)
        Xt = scaler.transform(Xt)
        Xbt = scaler.transform(Xbt)

    if flg['d']:
        C_range = [float(c) for c in opt['svc_c_range'].split(',') if c]
        gamma_range = [float(g) for g in opt['svc_gamma_range'].split(',') if g]
        kernel_range = [str(s) for s in opt['svc_kernel_range'].split(',') if s]
        poly_range = [int(i) for i in opt['svc_poly_range'].split(',') if i]
        allkwargs = dict(C=C_range, gamma=gamma_range,
                         kernel=kernel_range, degree=poly_range)
        kwargs = {}
        for k in allkwargs:
            if allkwargs[k]:
                kwargs[k] = allkwargs[k]
        msgr.message("Exploring the SVC domain.")
        grid = explore_SVC(Xbt, Ybt, n_folds=5, n_jobs=int(opt['svc_n_jobs']),
                           **kwargs)
        import pickle
        krnlstr = '_'.join(s for s in opt['svc_kernel_range'].split(',') if s)
        pkl = open('grid%s.pkl' % krnlstr, 'w')
        pickle.dump(grid, pkl)
        pkl.close()
#        pkl = open('grid.pkl', 'r')
#        grid = pickle.load(pkl)
#        pkl.close()
        plot_grid(grid, save=opt['svc_img'])

    # test the accuracy of different classifiers
    if flg['t']:
        # test different classifiers
        msgr.message("Exploring different classifiers.")
        msgr.message("cls_id   cls_name          mean     max     min     std")

        res = explorer_clsfiers(classifiers, Xt, Yt, labels=labels,
                                indexes=indexes, n_folds=5,
                                bv=flg['v'], extra=flg['x'])
        # TODO: sort(order=...) is working only in the terminal, why?
        #res.sort(order='mean')
        with open(opt['csv_test_cls'], 'w') as csv:
            csv.write(tocsv(res))

    if flg['c']:
        # classify
        data = np.load(opt['npy_data'])
        indx = np.load(opt['npy_index'])

        # Substitute using column values
        data, dummy = substitute(data, rules, cols[1:])
        Xt = data[indx]

        if scaler:
            msgr.message("Scaling the training data set.")
            scaler.fit(Xt, Yt)
            Xt = scaler.transform(Xt)
            msgr.message("Scaling the whole data set.")
            data = scaler.transform(data)
        if decmp:
            msgr.message("Decomposing the training data set.")
            decmp.fit(Xt)
            Xt = decmp.transform(Xt)
            msgr.message("Decompose the whole data set.")
            data = decmp.transform(data)
        cats = np.load(opt['npy_cats'])

        np.save('data_filled_scaled.npy', data)
        tcols = []
        for cls in classifiers:
            report = (open(opt['report_class'], "w")
                      if opt['report_class'] else sys.stdout)
            run_classifier(cls, Xt, Yt, Xt, Yt, labels, data,
                           report=report)
            tcols.append((cls['name'], 'INTEGER'))

        import pickle
        with open('classification_results.pkl', 'w') as res:
              pickle.dump(classifiers, res)
        #classifiers = pickle.load(res)
        msgr.message("Export the results to layer: <%s>" % str(rlayer))
        export_results(vect, classifiers, cats, rlayer, vtraining, tcols,
                       overwrite(), pkl='res.pkl', append=flg['a'])
#        res.close()

    if flg['r']:
        rules = ('\n'.join(['%d %s' % (k, v)
                            for k, v in get_colors(vtraining).items()])
                 if vtraining else None)

        msgr.message("Export the layer with results to raster")
        with Vector(vect, mode='r') as vct:
            tab = vct.dblinks.by_name(rlayer).table()
            rasters = [c for c in tab.columns]
            rasters.remove(tab.key)

        v2rst = Module('v.to.rast')
        rclrs = Module('r.colors')
        for rst in rasters:
            v2rst(input=vect, layer=rlayer, type='area',
                  use='attr', attrcolumn=rst.encode(),
                  output=(opt['rst_names'] % rst).encode(),
                  memory=1000, overwrite=overwrite())
            if rules:
                rclrs(map=rst.encode(), rules='-', stdin_=rules)
Ejemplo n.º 2
0
def main(opt, flg):
    # import functions which depend on sklearn only after parser run
    from ml_functions import (
        balance,
        explorer_clsfiers,
        run_classifier,
        optimize_training,
        explore_SVC,
        plot_grid,
    )
    from features import importances, tocsv

    msgr = get_msgr()
    indexes = None
    vect = opt["vector"]
    vtraining = opt["vtraining"] if opt["vtraining"] else None
    scaler, decmp = None, None
    vlayer = opt["vlayer"] if opt["vlayer"] else vect + "_stats"
    tlayer = opt["tlayer"] if opt["tlayer"] else vect + "_training"
    rlayer = opt["rlayer"] if opt["rlayer"] else vect + "_results"

    labels = extract_classes(vtraining, 1)
    pprint(labels)

    if opt["scalar"]:
        scapar = opt["scalar"].split(",")
        from sklearn.preprocessing import StandardScaler

        scaler = StandardScaler(with_mean="with_mean" in scapar,
                                with_std="with_std" in scapar)

    if opt["decomposition"]:
        dec, params = (opt["decomposition"].split("|") if "|"
                       in opt["decomposition"] else (opt["decomposition"], ""))
        kwargs = ({k: v
                   for k, v in (p.split("=")
                                for p in params.split(","))} if params else {})
        load_decompositions()
        decmp = DECMP[dec](**kwargs)

    # if training extract training
    if vtraining and flg["e"]:
        msgr.message("Extract training from: <%s> to <%s>." %
                     (vtraining, vect))
        extract_training(vect, vtraining, tlayer)
        flg["n"] = True

    if flg["n"]:
        msgr.message("Save arrays to npy files.")
        save2npy(
            vect,
            vlayer,
            tlayer,
            fcats=opt["npy_cats"],
            fcols=opt["npy_cols"],
            fdata=opt["npy_data"],
            findx=opt["npy_index"],
            fclss=opt["npy_tclasses"],
            ftdata=opt["npy_tdata"],
        )

    # define the classifiers to use/test
    if opt["pyclassifiers"] and opt["pyvar"]:
        # import classifiers to use
        mycls = SourceFileLoader("mycls", opt["pyclassifiers"]).load_module()
        classifiers = getattr(mycls, opt["pyvar"])
    else:
        from ml_classifiers import CLASSIFIERS

        classifiers = CLASSIFIERS

    # Append the SVC classifier
    if opt["svc_c"] and opt["svc_gamma"]:
        from sklearn.svm import SVC

        svc = {
            "name": "SVC",
            "classifier": SVC,
            "kwargs": {
                "C": float(opt["svc_c"]),
                "gamma": float(opt["svc_gamma"]),
                "kernel": opt["svc_kernel"],
            },
        }
        classifiers.append(svc)

    # extract classifiers from pyindx
    if opt["pyindx"]:
        indexes = [i for i in get_indexes(opt["pyindx"])]
        classifiers = [classifiers[i] for i in indexes]

    num = int(opt["n_training"]) if opt["n_training"] else None

    # load fron npy files
    Xt = np.load(opt["npy_tdata"])
    Yt = np.load(opt["npy_tclasses"])
    cols = np.load(opt["npy_cols"])

    # Define rules to substitute NaN, Inf, posInf, negInf values
    rules = {}
    for key in ("nan", "inf", "neginf", "posinf"):
        if opt[key]:
            rules[key] = get_rules(opt[key])
    pprint(rules)

    # Substitute (skip cat column)
    Xt, rules_vals = substitute(Xt, rules, cols[1:])
    Xtoriginal = Xt

    # scale the data
    if scaler:
        msgr.message("Scaling the training data set.")
        scaler.fit(Xt, Yt)
        Xt = scaler.transform(Xt)

    # decompose data
    if decmp:
        msgr.message("Decomposing the training data set.")
        decmp.fit(Xt)
        Xt = decmp.transform(Xt)

    # Feature importances with forests of trees
    if flg["f"]:
        np.save("training_transformed.npy", Xt)
        importances(
            Xt,
            Yt,
            cols[1:],
            csv=opt["imp_csv"],
            img=opt["imp_fig"],
            # default parameters to save the matplotlib figure
            **dict(dpi=300, transparent=False, bbox_inches="tight"),
        )

    # optimize the training set
    if flg["o"]:
        ind_optimize = int(
            opt["pyindx_optimize"]) if opt["pyindx_optimize"] else 0
        cls = classifiers[ind_optimize]
        msgr.message("Find the optimum training set.")
        best, Xbt, Ybt = optimize_training(
            cls,
            Xt,
            Yt,
            labels,  # {v: k for k, v in labels.items()},
            scaler,
            decmp,
            num=num,
            maxiterations=1000,
        )
        msg = "    - save the optimum training data set to: %s."
        msgr.message(msg % opt["npy_btdata"])
        np.save(opt["npy_btdata"], Xbt)
        msg = "    - save the optimum training classes set to: %s."
        msgr.message(msg % opt["npy_btclasses"])
        np.save(opt["npy_btclasses"], Ybt)

    # balance the data
    if flg["b"]:
        msg = "Balancing the training data set, each class have <%d> samples."
        msgr.message(msg % num)
        Xbt, Ybt = balance(Xt, Yt, num)
    else:
        if not flg["o"]:
            Xbt = (np.load(opt["npy_btdata"])
                   if os.path.isfile(opt["npy_btdata"]) else Xt)
            Ybt = (np.load(opt["npy_btclasses"])
                   if os.path.isfile(opt["npy_btclasses"]) else Yt)

    # scale the data
    if scaler:
        msgr.message("Scaling the training data set.")
        scaler.fit(Xbt, Ybt)
        Xt = scaler.transform(Xt)
        Xbt = scaler.transform(Xbt)

    if flg["d"]:
        C_range = [float(c) for c in opt["svc_c_range"].split(",") if c]
        gamma_range = [
            float(g) for g in opt["svc_gamma_range"].split(",") if g
        ]
        kernel_range = [
            str(s) for s in opt["svc_kernel_range"].split(",") if s
        ]
        poly_range = [int(i) for i in opt["svc_poly_range"].split(",") if i]
        allkwargs = dict(C=C_range,
                         gamma=gamma_range,
                         kernel=kernel_range,
                         degree=poly_range)
        kwargs = {}
        for k in allkwargs:
            if allkwargs[k]:
                kwargs[k] = allkwargs[k]
        msgr.message("Exploring the SVC domain.")
        grid = explore_SVC(Xbt,
                           Ybt,
                           n_folds=5,
                           n_jobs=int(opt["svc_n_jobs"]),
                           **kwargs)
        import pickle

        krnlstr = "_".join(s for s in opt["svc_kernel_range"].split(",") if s)
        pkl = open("grid%s.pkl" % krnlstr, "w")
        pickle.dump(grid, pkl)
        pkl.close()
        #        pkl = open('grid.pkl', 'r')
        #        grid = pickle.load(pkl)
        #        pkl.close()
        plot_grid(grid, save=opt["svc_img"])

    # test the accuracy of different classifiers
    if flg["t"]:
        # test different classifiers
        msgr.message("Exploring different classifiers.")
        msgr.message("cls_id   cls_name          mean     max     min     std")

        res = explorer_clsfiers(
            classifiers,
            Xt,
            Yt,
            labels=labels,
            indexes=indexes,
            n_folds=5,
            bv=flg["v"],
            extra=flg["x"],
        )
        # TODO: sort(order=...) is working only in the terminal, why?
        # res.sort(order='mean')
        with open(opt["csv_test_cls"], "w") as csv:
            csv.write(tocsv(res))

    if flg["c"]:
        # classify
        data = np.load(opt["npy_data"])
        indx = np.load(opt["npy_index"])

        # Substitute using column values
        data, dummy = substitute(data, rules, cols[1:])
        Xt = data[indx]

        if scaler:
            msgr.message("Scaling the training data set.")
            scaler.fit(Xt, Yt)
            Xt = scaler.transform(Xt)
            msgr.message("Scaling the whole data set.")
            data = scaler.transform(data)
        if decmp:
            msgr.message("Decomposing the training data set.")
            decmp.fit(Xt)
            Xt = decmp.transform(Xt)
            msgr.message("Decompose the whole data set.")
            data = decmp.transform(data)
        cats = np.load(opt["npy_cats"])

        np.save("data_filled_scaled.npy", data)
        tcols = []
        for cls in classifiers:
            report = (open(opt["report_class"], "w")
                      if opt["report_class"] else sys.stdout)
            run_classifier(cls, Xt, Yt, Xt, Yt, labels, data, report=report)
            tcols.append((cls["name"], "INTEGER"))

        import pickle

        with open("classification_results.pkl", "w") as res:
            pickle.dump(classifiers, res)
        # classifiers = pickle.load(res)
        msgr.message("Export the results to layer: <%s>" % str(rlayer))
        export_results(
            vect,
            classifiers,
            cats,
            rlayer,
            vtraining,
            tcols,
            overwrite(),
            pkl="res.pkl",
            append=flg["a"],
        )
    #        res.close()

    if flg["r"]:
        rules = ("\n".join([
            "%d %s" % (k, v) for k, v in get_colors(vtraining).items()
        ]) if vtraining else None)

        msgr.message("Export the layer with results to raster")
        with Vector(vect, mode="r") as vct:
            tab = vct.dblinks.by_name(rlayer).table()
            rasters = [c for c in tab.columns]
            rasters.remove(tab.key)

        v2rst = Module("v.to.rast")
        rclrs = Module("r.colors")
        for rst in rasters:
            v2rst(
                input=vect,
                layer=rlayer,
                type="area",
                use="attr",
                attrcolumn=rst.encode(),
                output=(opt["rst_names"] % rst).encode(),
                memory=1000,
                overwrite=overwrite(),
            )
            if rules:
                rclrs(map=rst.encode(), rules="-", stdin_=rules)