Example #1
0
File: operators.py  Project: tcpavel/tweeql
 def __init__(self, child, groupby, aggregates, window):
     """Configure a grouping operator over *child* and build its aggregator.

     child      -- upstream query operator supplying tuples
     groupby    -- descriptor of the fields to group tuples by
     aggregates -- aggregate field descriptors evaluated per group
     window     -- window length handed through to the Aggregator
     """
     QueryOperator.__init__(self)
     self.child = child
     # Must run after self.child is assigned: the impl consults the child.
     self.can_query_stream_cache = self.can_query_stream_impl()
     self.window = window
     self.groupby = groupby
     self.aggregates = aggregates
     self.aggregator = Aggregator(self.aggregates, self.groupby, self.window)
Example #2
0
File: operators.py  Project: bh0085/tweeql
 def __init__(self, child, groupby, aggregates, window):
     """Initialise the operator and wire up an Aggregator instance."""
     QueryOperator.__init__(self)
     self.child = child
     # Cache the answer once; it only depends on the (now-set) child.
     self.can_query_stream_cache = self.can_query_stream_impl()
     self.groupby, self.aggregates, self.window = groupby, aggregates, window
     self.aggregator = Aggregator(
         self.aggregates, self.groupby, self.window)
Example #3
0
File: operators.py  Project: tcpavel/tweeql
class GroupBy(QueryOperator):
    """Group tuples from a child operator and aggregate them.

    Results emitted by ``child`` are bucketed by the ``groupby`` fields,
    the aggregate function(s) run over each bucket, and buckets are
    emitted and reset every ``window`` seconds.
    """

    def __init__(self, child, groupby, aggregates, window):
        QueryOperator.__init__(self)
        self.child = child
        # The cached flag is derived from the child, so set the child first.
        self.can_query_stream_cache = self.can_query_stream_impl()
        self.groupby = groupby
        self.aggregates = aggregates
        self.window = window
        self.aggregator = Aggregator(
            self.aggregates, self.groupby, self.window)

    def filter(self, updates, return_passes, return_fails):
        # Nothing to produce unless the caller wants passing tuples.
        if not return_passes:
            return (None, None)
        passes, _ = self.child.filter(updates, return_passes, return_fails)
        # Feed passing tuples to the aggregator; it decides what to emit.
        emissions = list(self.aggregator.update(passes))
        return (emissions, None)

    def filter_params(self):
        return self.child.filter_params()

    def can_query_stream(self):
        return self.can_query_stream_cache

    def can_query_stream_impl(self):
        return self.child.can_query_stream()

    def assign_descriptor(self, tuple_descriptor):
        self.tuple_descriptor = tuple_descriptor
        self.aggregator.tuple_descriptor = tuple_descriptor
        # The child must expose the grouping fields, every aggregate field,
        # the created-time field, and everything the caller asked for.
        combined = self.groupby.duplicate()
        for aggregate in self.aggregates:
            combined.add_descriptor(aggregate)
        combined.add_descriptor(TwitterFields.created_field)
        for fd in tuple_descriptor.descriptors.values():
            combined.add_descriptor(fd)
        self.child.assign_descriptor(combined)
Example #4
0
File: operators.py  Project: bh0085/tweeql
class GroupBy(QueryOperator):
    """
    Bucket the child's output on a set of grouping fields, run the
    configured aggregate function(s) over each bucket, and emit the
    results while clearing the buckets once per window.
    """

    def __init__(self, child, groupby, aggregates, window):
        QueryOperator.__init__(self)
        self.child = child
        self.can_query_stream_cache = self.can_query_stream_impl()  # needs self.child
        self.groupby = groupby
        self.aggregates = aggregates
        self.window = window
        self.aggregator = Aggregator(self.aggregates, self.groupby, self.window)

    def filter(self, updates, return_passes, return_fails):
        if return_passes:
            child_result = self.child.filter(updates, return_passes, return_fails)
            passing_tuples = child_result[0]
            emitted = []
            emitted.extend(self.aggregator.update(passing_tuples))
            return (emitted, None)
        return (None, None)

    def filter_params(self):
        # Delegate straight to the wrapped operator.
        return self.child.filter_params()

    def can_query_stream(self):
        return self.can_query_stream_cache

    def can_query_stream_impl(self):
        return self.child.can_query_stream()

    def assign_descriptor(self, tuple_descriptor):
        self.tuple_descriptor = tuple_descriptor
        self.aggregator.tuple_descriptor = tuple_descriptor
        # Build the descriptor the child must satisfy: grouping fields,
        # aggregate fields, the created-time field, and the caller's fields.
        descriptor_for_child = self.groupby.duplicate()
        for agg_field in self.aggregates:
            descriptor_for_child.add_descriptor(agg_field)
        descriptor_for_child.add_descriptor(TwitterFields.created_field)
        for _alias, field_descriptor in tuple_descriptor.descriptors.items():
            descriptor_for_child.add_descriptor(field_descriptor)
        self.child.assign_descriptor(descriptor_for_child)
Example #5
0
def launch_test(test_params):
    """
    This function instantiates the base classifiers and the aggregators. It
    dispatches the computation on several cores of the micro-processor (if any).
    It also takes care of experimental data saving, figure edition and so on.

    test_params: dict of experiment settings. Keys read in this excerpt:
    "dataset", "n", optional "beta", optional "adversary", optional "noisy",
    optional "names", "tnorm", "tnorm_ada", "clone", "random", "dummy",
    "mode", "sent", "iter_max", "rand_state".
    """
    ##########################################
    # PARAMETERS BELOW SHOULD NOT BE CHANGED #
    ##########################################
    # Granularity used by every Predictor when tracking rates.
    rate_resolution = 0.01
    # --- Dataset -----------------------------------------------------------
    if (test_params["dataset"] in ['moons', 'circles', 'blobs', 'neoblobs']):
        n = test_params["n"]
        if 'beta' in test_params.keys():
            mybeta = test_params["beta"]
        else:
            mybeta = 0.5
        Data = synthDataset(test_params["dataset"], n, beta=mybeta)
        synth_data = True
    else:
        raise ValueError("Unknown dataset name.")
    n_class = Data.n_class
    # Ns: current number of base classifiers; grows as extra clfs are added.
    Ns = Data.Ns
    # Count how many adversarial / noisy classifier instances are requested.
    n_adver = 0
    if "adversary" in test_params.keys():
        for i in range(len(test_params["adversary"])):
            n_adver += test_params["adversary"][i][1]
    n_bad = 0
    if "noisy" in test_params.keys():
        for i in range(len(test_params["noisy"])):
            n_bad += test_params["noisy"][i][1]
    if 'names' in test_params.keys():
        names = test_params["names"]
    else:
        # Default: every base classifier is a shallow decision tree.
        names = []
        for i in range(Ns):
            names.append("Tree")

    # --- Hyper-parameter grids for the t-norm based aggregators ------------
    if (test_params["tnorm"] in ['Aczel-Alsina']):
        lambda_range = np.logspace(0, 1, 101)
        lambda_default = 5.0
    elif (test_params["tnorm"] in ['convex']):
        lambda_range = np.linspace(0.0, 1.0, 101)
        lambda_default = 0.5
    else:
        raise ValueError('Unknown tnorm name.')

    if (test_params["tnorm_ada"] in ['convex']):
        lambda_range_ada = np.linspace(0.0, 1.0, 101)
        lambda_default_ada = 0.5
    elif (test_params["tnorm_ada"] in ['Aczel-Alsina']):
        lambda_range_ada = np.logspace(0, 1, 101)
        lambda_default_ada = 1.0
    else:
        raise ValueError('Unknown tnorm name.')
    # Search grids shared by the weighted-vote / stacking / adaSPOCC models.
    r_range = np.logspace(-1, 2, 201)
    alpha_range = np.linspace(0, 1, 101)
    regul_range = np.logspace(-2, 2, 101)
    rho_range = np.logspace(-2, 2, 101)

    # Models
    # One scikit-learn estimator per requested name, each wrapped in a
    # project Predictor that tracks its accuracy at rate_resolution.
    clf = []
    predictors = []
    for i in range(Ns):
        if names[i] == 'Tree':
            clf.append(DecisionTreeClassifier(max_depth=2))
        if names[i] == 'Reg Log':
            clf.append(LogisticRegression(penalty='l2', C=1.0, solver='lbfgs'))
        if names[i] == 'NBC':
            clf.append(GaussianNB())
        if names[i] == 'QDA':
            clf.append(QuadraticDiscriminantAnalysis())
        if names[i] == 'SVM_lin':
            clf.append(SVC(kernel="linear", C=0.025))
        if names[i] == 'SVM_nlin':
            clf.append(SVC(gamma=2, C=1))
        if names[i] == 'GP':
            clf.append(GaussianProcessClassifier(1.0 * RBF(1.0)))
        if names[i] == 'RF':
            clf.append(
                RandomForestClassifier(max_depth=5,
                                       n_estimators=10,
                                       max_features=1))
        if names[i] == 'MLP':
            clf.append(MLPClassifier(alpha=1))
        if names[i] == 'Ada':
            clf.append(AdaBoostClassifier())
        if names[i] == 'kNN':
            clf.append(KNeighborsClassifier(n_neighbors=5))
        # NOTE(review): an unrecognised name appends no classifier, which
        # would desynchronise clf from names and make clf[i] fail below --
        # confirm names are validated upstream.
        predictors.append(
            Predictor('Base Clf ' + str(i + 1), 'base_clf', clf[i],
                      rate_resolution, 'k'))

    # --- Adversarial copies of base classifiers ----------------------------
    # NOTE(review): unlike the counting loops above, "adversary" is accessed
    # here without checking it is present in test_params -- KeyError if absent.
    adv = []
    for i in range(len(test_params["adversary"])):
        for j in range(test_params["adversary"][i][1]):
            adv.append(
                Adversary(clf[test_params["adversary"][i][0]], n_class,
                          test_params["adversary"][i][2]))
            # NOTE(review): adv[i] indexes by spec i, not by the instance
            # just appended; with several specs or counts > 1 this wraps the
            # wrong object (adv[-1] looks intended). Same pattern for bad[i].
            predictors.append(
                Predictor('Advers. Clf ' + str(i + 1), 'base_clf', adv[i],
                          rate_resolution, 'k'))
            Ns += 1

    # --- Noisy copies of base classifiers ----------------------------------
    bad = []
    for i in range(len(test_params["noisy"])):
        for j in range(test_params["noisy"][i][1]):
            bad.append(
                NoisyClf(clf[test_params["noisy"][i][0]], n_class,
                         test_params["noisy"][i][2]))
            predictors.append(
                Predictor('Noisy Clf ' + str(i + 1), 'base_clf', bad[i],
                          rate_resolution, 'k'))
            Ns += 1

    # Clones of the first predictor (redundant ensemble members).
    for i in range(test_params["clone"]):
        predictors.append(predictors[0].clone())
        predictors[-1].name = 'Clone ' + str(i + 1)
        clf.append(predictors[-1].machine)
        Ns += 1

    # Uninformative members: uniform-random and constant (majority) dummies.
    for i in range(test_params["random"]):
        clf.append(DummyClassifier(strategy='uniform'))
        predictors.append(
            Predictor('Random Clf ' + str(i + 1), 'base_clf', clf[-1],
                      rate_resolution, 'k'))
        Ns += 1

    for i in range(test_params["dummy"]):
        clf.append(DummyClassifier(strategy='most_frequent'))
        predictors.append(
            Predictor('Constant Clf ' + str(i + 1), 'base_clf', clf[-1],
                      rate_resolution, 'k'))
        Ns += 1

    # --- Aggregators under comparison --------------------------------------
    selec = Aggregator('selection', Ns, n_class)
    predictors.append(
        Predictor('Selected Clf.', 'agg', selec, rate_resolution, 'b'))

    wvote = Aggregator('weighted_vote',
                       Ns,
                       n_class,
                       params={
                           "r_range": r_range,
                           "expo": False
                       })
    predictors.append(
        Predictor('Weighted Vote Ens.', 'agg', wvote, rate_resolution,
                  'orange'))

    expow = Aggregator('weighted_vote',
                       Ns,
                       n_class,
                       params={"r_range": r_range})
    predictors.append(
        Predictor('Expo. Weighted Vote Ens.', 'agg', expow, rate_resolution,
                  'brown'))

    naive = Aggregator('naive', Ns, n_class, params={"method": 'indep'})
    predictors.append(
        Predictor('Naive Bayes.', 'agg', naive, rate_resolution, '--m'))

    spocc = Aggregator(
        'spocc', Ns, n_class, {
            "tnorm": test_params["tnorm"],
            "hyper": lambda_default,
            "hyper_range": lambda_range
        })
    predictors.append(
        Predictor('SPOCC (' + test_params["tnorm"] + ')', 'agg', spocc,
                  rate_resolution, 'm'))

    adaspocc = Aggregator(
        'adaspocc', Ns, n_class, {
            "tnorm": test_params["tnorm_ada"],
            "hyper": lambda_default_ada,
            "hyper_range": lambda_range_ada,
            "alpha_range": alpha_range,
            "rho": 1,
            "rho_range": rho_range
        })
    predictors.append(
        Predictor('adaSPOCC (' + test_params["tnorm_ada"] + ')', 'agg',
                  adaspocc, rate_resolution, 'm'))

    stack = Aggregator('stacked_logreg', Ns, n_class,
                       {"regul_range": regul_range})
    predictors.append(
        Predictor('Stacked Log. Reg.', 'agg', stack, rate_resolution, 'r'))

    # Reference baselines: a centralized classifier trained on all the data...
    clf_ctl = LogisticRegression(penalty='l2', C=1.0)
    predictors.append(
        Predictor('Centralized Clf.', 'global', clf_ctl, rate_resolution,
                  '--g'))

    # ...the exact Bayes aggregation (skipped on the large real datasets,
    # presumably for tractability -- TODO confirm)...
    if (test_params["dataset"] not in ['20newsgroup', 'mnist', 'drive']):
        bayes = Aggregator('bayes', Ns, n_class)
        predictors.append(
            Predictor('Bayes Agg.', 'agg', bayes, rate_resolution, ':g'))

    # ...and the oracle/optimal classifier, available for synthetic data only.
    if synth_data:
        optim = Oracle(test_params["dataset"])
        predictors.append(
            Predictor('Optimal Clf.', 'oracle', optim, rate_resolution, 'g'))

    # Fix the experiment-invariant arguments; only the random state varies.
    main_loop_partial = partial(main_loop,
                                n=n,
                                Data=Data,
                                mode=test_params["mode"],
                                Ns=Ns,
                                predictors=predictors,
                                sent=test_params["sent"],
                                iter_max=test_params["iter_max"])
    results = main_loop_partial(test_params["rand_state"])
    # conf_matrix and failed_loc are not referenced in this excerpt --
    # presumably consumed further down (saving/plotting); verify before removal.
    accuracy, conf_matrix, failed_loc = results
    for i in range(len(predictors)):
        print(predictors[i].name + " has accuracy " + str(accuracy[i]))