Python Bunch.predicted Examples

Programming Language: Python

Namespace/Package Name: sklearn.datasets.base

Class/Type: Bunch

Method/Function: predicted

Examples at hotexamples.com: 5

Python Bunch.predicted - 5 examples found. These are the top rated real world Python examples of sklearn.datasets.base.Bunch.predicted extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Bunch(30)

data(22)

label(6)

remaining(5)

predicted(5)

sentence(3)

oracle(3)

contents(3)

fixk(3)

labels(3)

X(2)

entities(2)

keys(2)

kwords(2)

filenames(2)

text(2)

meta(2)

lable(1)

masker(1)

hyperparams(1)

offset(1)

groups(1)

mask(1)

fig(1)

func(1)

condition_mask(1)

accu(1)

ax(1)

bow(1)

category_labels(1)

clfreg(1)

cmap(1)

content(1)

feature_names(1)

contents_seq(1)

contents_seq_pad(1)

coordinate_names(1)

coordinates(1)

data_fn(1)

data_test(1)

description(1)

zmaps(1)

Example #1

Show file

File: unckcheatv2.py Project: mramire8/active

def main():
    """Run the "cheating" uncertainty-sampling active learning experiment.

    The function reads its configuration from the module-level ``args``
    object (``train``, ``fixk``, ``cost_model``, ``cost_function``,
    ``neutral_threshold``, ``step_size``, ``bootstrap``, ``trials``,
    ``budget`` and ``maxiter`` are read below).

    Steps, as implemented here:

    1. Load the dataset from a pickle cache named ``<train><fixk>.p``, or
       build it with ``load_dataset`` and write the cache.
    2. Build the cost model, the student classifier and a
       ``NeutralityExpert`` whose classifier is fit on ``data.test``.
    3. For every trial, bootstrap once and then repeatedly query the expert
       with the k-word version of the picked documents, retrain the student
       on the full documents and record accuracy / AUC against the
       cumulative cost.
    4. Extrapolate the per-trial curves and print them.

    Returns nothing; results are printed.
    """
    accuracies = defaultdict(lambda: [])
    aucs = defaultdict(lambda: [])
    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()
    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(100, args.fixk)

    fixk_saved = "{0}{1}.p".format(args.train, args.fixk)

    # Reuse the cached dataset when the file can be opened, otherwise build it
    # and write the cache. The ``with`` blocks make sure the handles are closed.
    # NOTE: pickle must only be used on cache files this program wrote itself.
    try:
        with open(fixk_saved, "rb") as fixk_file:
            data = pickle.load(fixk_file)
    except IOError:
        data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5)
        with open(fixk_saved, "wb") as fixk_file:
            pickle.dump(data, fixk_file)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = parse_parameters_mat(args.cost_model)

    print("Cost Parameters %s" % parameters)

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print("\nCost Model: %s" % cost_model.__class__.__name__)

    #### STUDENT CLASSIFIER
    clf = linear_model.LogisticRegression(penalty="l1", C=1)
    print("\nStudent Classifier: %s" % clf)

    #### EXPERT CLASSIFIER
    # The expert's classifier is fit on the test split.
    exp_clf = linear_model.LogisticRegression(penalty='l1', C=.3)
    exp_clf.fit(data.test.bow, data.test.target)
    expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                         cost_function=cost_model.cost_function)
    print("\nExpert: %s " % expert)

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          min_size))
    print("Cheating experiment - use full uncertainty query k words")
    t0 = time.time()

    ### experiment starts
    tac = []  # per-trial [cost, accuracy] curves
    tau = []  # per-trial [cost, auc] curves
    for t in range(args.trials):
        trial_accu = []
        trial_aucs = []

        print("*" * 60)
        print("Trial: %s" % t)

        student = randomsampling.UncertaintyLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t)
        print("\nStudent: %s " % student)
        train_indices = []
        train_x = []
        train_y = []
        pool = Bunch()
        pool.data = data.train.bow.tocsr()   # full words, for training
        pool.fixk = data.train.bowk.tocsr()  # k words BOW for querying
        pool.target = data.train.target
        pool.predicted = []
        pool.kwords = np.array(data.train.kwords)  # k words
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        bootstrapped = False

        current_cost = 0
        iteration = 0
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
            # Reset the per-query cost so that an iteration without any useful
            # answer charges nothing instead of re-using the previous cost.
            query_cost = 0

            if not bootstrapped:
                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                print("Bootstrap: %s " % bt.__class__.__name__)
                print("")
            else:
                query_index = student.pick_next(pool=pool, k=step_size)

            query = pool.fixk[query_index]  # query with k words

            query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]]

            ground_truth = pool.target[query_index]
            if iteration == 0:  ## bootstrap uses ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth)  ## bootstrap cost is ignored
            else:
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            ## add data recent acquired to train
            ## CHANGE: if label is not useful, ignore and do not charge money for it
            useful_answers = np.array([[x, y, z] for x, y, z in zip(query_index, labels, spent) if y is not None])

            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])

                # add labels to training
                train_x = pool.data[train_indices]  ## train with all the words

                # update labels with the expert labels
                train_y.extend(useful_answers[:, 1])

                ### accumulate the cost of the useful answers only
                query_cost = np.sum(useful_answers[:, 2])
                current_cost += query_cost

            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # retrain the model
            current_model = student.train(train_x, train_y)

            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(len(train_indices),
                                                                                            accu,
                                                                                            auc, query_cost,
                                                                                            current_cost, spent))

            ## the results should be based on the cost of the labeling
            if iteration > 0:   # skip the bootstrap iteration

                student.budget -= query_cost  ## Bootstrap doesn't count

                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)

                ## partial trial results
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])
            iteration += 1

        # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
    # end trial loop

    # The per-cost dictionaries are replaced by the extrapolated curves.
    accuracies = extrapolate_trials(tac)
    aucs = extrapolate_trials(tau)

    print("Elapsed time %.3f" % (time.time() - t0))
    print_extrapolated_results(accuracies, aucs)

Example #2

Show file

File: sent_unc.py Project: mramire8/active

def main():
    """Run the sentence-level ("anytime") active learning experiment.

    Configuration is read from the module-level ``args`` object. The function:

    1. Loads the raw dataset with ``load_from_file`` and cleans the HTML.
    2. Unless ``args.fulloracle`` is set, splits train/test into expert data
       (``sentence`` / ``oracle``) and pool/test data with ``split_data``.
    3. Vectorizes the documents, trains the oracle classifier on sentences,
       builds the expert selected by ``args.expert`` and, when
       ``args.cheating`` is set, a sentence classifier.
    4. For every trial, bootstraps once and then lets the student pick
       sentences to query, retrains, and records accuracy / AUC / oracle
       statistics against the cumulative cost.
    5. Prints the (optionally extrapolated) results and the oracle accuracy.

    Raises:
        Exception: if no known expert type is named in ``args.expert`` or if
            the training matrices and label lists get out of sync.
    """
    print(args)
    print("")

    accuracies = defaultdict(lambda: [])
    ora_accu = defaultdict(lambda: [])
    oracle_accuracies = []
    ora_cm = defaultdict(lambda: [])
    lbl_dit = defaultdict(lambda: [])
    aucs = defaultdict(lambda: [])
    x_axis = defaultdict(lambda: [])

    vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = 10

    args.fixk = None

    data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = experiment_utils.parse_parameters_mat(args.cost_model)

    print("Cost Parameters %s" % parameters)

    cost_model = experiment_utils.set_cost_model(args.cost_function, parameters=parameters)
    print("\nCost Model: %s" % cost_model.__class__.__name__)

    ### SENTENCE TRANSFORMATION
    if args.train == "twitter":
        sent_detector = TwitterSentenceTokenizer()
    else:
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## delete <br> to "." to recognize as end of sentence
    data.train.data = experiment_utils.clean_html(data.train.data)
    data.test.data = experiment_utils.clean_html(data.test.data)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
    ## Get the features of the sentence dataset

    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    if not args.fulloracle:
        train_test_data = Bunch()

        expert_data.sentence, train_test_data.pool = split_data(data.train)
        expert_data.oracle, train_test_data.test = split_data(data.test)

        data.train.data = train_test_data.pool.train.data
        data.train.target = train_test_data.pool.train.target

        data.test.data = train_test_data.test.train.data
        data.test.target = train_test_data.test.train.target

    ## convert document to matrix
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")
    exp_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)

    if not args.fulloracle:
        print("Training expert documents:%s" % len(expert_data.oracle.train.data))
        labels, sent_train = experiment_utils.split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=args.limit)

        expert_data.oracle.train.data = sent_train
        expert_data.oracle.train.target = np.array(labels)
        expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)

        exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)
    else:
        # Full oracle: the expert is trained on the sentences of the training split.
        expert_data.data = data.train.data
        expert_data.target = data.train.target
        expert_data.target_names = data.train.target_names
        labels, sent_train = experiment_utils.split_data_sentences(expert_data, sent_detector, vct, limit=args.limit)
        expert_data.bow = vct.transform(sent_train)
        expert_data.target = labels
        expert_data.data = sent_train
        exp_clf.fit(expert_data.bow, expert_data.target)

    if "neutral" in args.expert:
        expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                             cost_function=cost_model.cost_function)
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "pred" in args.expert:
        expert = baseexpert.PredictingExpert(exp_clf,  #threshold=args.neutral_threshold,
                                             cost_function=cost_model.cost_function)
    elif "human" in args.expert:
        expert = baseexpert.HumanExpert(", ".join(["{}={}".format(a,b) for a,b in enumerate(data.train.target_names)])+"? > ")
    else:
        raise Exception("We need an expert!")

    print("\nExpert: %s " % expert)

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    sent_clf = None
    if args.cheating:
        # NOTE(review): expert_data.sentence is only assigned in the
        # `not args.fulloracle` branch above, so combining --cheating with
        # --fulloracle looks unsupported - confirm the intended behaviour.
        labels, sent_train = experiment_utils.split_data_sentences(expert_data.sentence.train, sent_detector, vct, limit=args.limit)

        expert_data.sentence.train.data = sent_train
        expert_data.sentence.train.target = np.array(labels)
        expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)
        sent_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)
        sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    #### STUDENT CLASSIFIER
    clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)

    print("\nStudent Classifier: %s" % clf)
    print("\nSentence Classifier: %s" % sent_clf)
    print("\nExpert Oracle Classifier: %s" % exp_clf)
    print("\nPenalty Oracle: %s" % exp_clf.C)
    print("\nVectorizer: %s" % vct)

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          min_size))
    print("Anytime active learning experiment - use objective function to pick data")
    t0 = time.time()
    tac = []  # per-trial [cost, accuracy] curves
    tau = []  # per-trial [cost, auc] curves

    ### experiment starts
    for t in range(args.trials):
        trial_accu = []
        trial_aucs = []

        print("*" * 60)
        print("Trial: %s" % t)

        student = get_student(clf, cost_model, sent_clf, sent_detector, vct)
        student.human_mode = args.expert == 'human'

        print("\nStudent: %s " % student)

        train_indices = []
        neutral_data = []  # save the xik vectors
        train_x = []
        train_y = []
        neu_x = []  # data to train the classifier
        neu_y = np.array([])

        pool = Bunch()
        pool.data = data.train.bow.tocsr()  # full words, for training
        pool.text = data.train.data
        pool.target = data.train.target
        pool.predicted = []
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        bootstrapped = False
        current_cost = 0
        iteration = 0
        query_index = None
        query_size = None
        oracle_answers = 0
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:

            if not bootstrapped:
                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                query = pool.data[query_index]
                print("Bootstrap: %s " % bt.__class__.__name__)
                print("")
            else:
                chosen = student.pick_next(pool=pool, step_size=step_size)

                query_index = [x for x, y in chosen]  # document id of chosen instances
                query = [y[0] for x, y in chosen]  # sentence of the document

                query_size = [1] * len(query_index)

            ground_truth = pool.target[query_index]

            if iteration == 0:  ## bootstrap uses ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth)  ## bootstrap cost is ignored
            else:
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            ### accumulate the cost of the query
            query_cost = np.array(spent).sum()
            current_cost += query_cost

            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])

            # query_size is still None during the bootstrap iteration, hence
            # the conditional expression that skips the zip in that case.
            neutral_answers = np.array([[x, z] for x, y, z in zip(query_index, labels, query_size) if y is None]) \
                if iteration != 0 else np.array([])

            ## add data recent acquired to train
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])

                # add labels to training
                train_x = pool.data[train_indices]  # # train with all the words

                # update labels with the expert labels
                train_y.extend(useful_answers[:, 1])

            neu_x, neu_y, neutral_data = update_sentence(neutral_data, neu_x, neu_y, labels, query_index, pool, vct)

            if neu_y.shape[0] != neu_x.shape[0]:
                raise Exception("Training data corrupted!")
            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # retrain the model
            current_model = student.train_all(train_x, train_y, neu_x, neu_y)

            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            # number of expert labels in this query that agree with the ground truth
            correct_labels = (np.array(ground_truth) == np.array(labels).reshape(len(labels))).sum()

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tGT:{5}\tneu:{6}\t{7}\tND:{8}\tTD:{9}\t ora_accu:{10}".format(
                len(train_indices),
                accu,
                auc, query_cost,
                current_cost,
                ground_truth,
                len(neutral_answers), neu_y.shape[0], neu_y.sum(), np.array(train_y).sum(), correct_labels))

            ## the results should be based on the cost of the labeling
            if iteration > 0:  # skip the bootstrap iteration

                student.budget -= query_cost  ## Bootstrap doesn't count
                # oracle accuracy (from queries)
                oracle_answers += correct_labels
                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
                ora_accu[x_axis_range].append(1. * correct_labels)
                ora_cm[x_axis_range].append(metrics.confusion_matrix(ground_truth, labels, labels=np.unique(train_y)))
                lbl_dit[x_axis_range].append(np.sum(train_y))
                # partial trial results
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])
            iteration += 1
            # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)

        # Labels obtained after the bootstrap. A trial that gained none would
        # otherwise divide by zero, so its oracle accuracy is reported as 0.
        labels_after_bootstrap = len(train_indices) - bootstrap_size
        if labels_after_bootstrap != 0:
            trial_oracle_accuracy = 1. * oracle_answers / labels_after_bootstrap
        else:
            trial_oracle_accuracy = 0.
        oracle_accuracies.append(trial_oracle_accuracy)
        print("Trial: {}, Oracle right answers: {}, Iteration: {}, Labels:{}, ACCU-OR:{}".format(t, oracle_answers,
                 iteration, labels_after_bootstrap, trial_oracle_accuracy))
        # end trial loop

    # NOTE(review): this is a substring test against the literal "uniform"
    # (true for e.g. "log", false for "uniform", "form" or ""); it presumably
    # means `args.cost_function != "uniform"` - confirm before changing.
    if args.cost_function not in "uniform":
        accuracies = experiment_utils.extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = experiment_utils.extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)
    print("\nAverage oracle accuracy:  %s" % np.array(oracle_accuracies).mean())
    print("Elapsed time %.3f" % (time.time() - t0))
    cheating = "CHEATING" if args.cheating else "NOCHEAT"
    # Both reports share the same output file-name stem.
    file_name = args.train+"-"+cheating+"-"+args.prefix+"-"+args.classifier+"-"+args.student
    experiment_utils.print_extrapolated_results(accuracies, aucs, file_name=file_name)
    experiment_utils.oracle_accuracy(ora_accu, file_name=file_name, cm=ora_cm, num_trials=args.trials)

Example #3

Show file

File: traintest.py Project: mramire8/active

def main():
    """Run the random-sampling train/test active learning experiment.

    Configuration is read from the module-level ``args`` object. The function:

    1. Loads the dataset chosen by ``args.train`` ("imdb", "20news" or
       "dummy"; "aviation" and unknown names raise).
    2. Builds the cost model, a ``MultinomialNB`` student classifier and the
       expert selected by ``args.expert``.
    3. For every trial, bootstraps once and then repeatedly picks documents
       with ``RandomSamplingLearner``, asks the expert for labels, retrains
       and records accuracy / AUC against the cumulative cost.
    4. Prints the elapsed time and the collected results.

    Raises:
        Exception: for an unsupported dataset or expert name, or if the
            training matrix and label list get out of sync.
    """
    accuracies = defaultdict(lambda: [])
    aucs = defaultdict(lambda: [])
    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()
    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(50, args.fixk)

    if "imdb" in args.train:
        ########## IMDB MOVIE REVIEWS ###########
        data = load_imdb(args.train, shuffle=True, rnd=2356, vct=vct, min_size=min_size,
                               fix_k=args.fixk)  # should brind data as is
    elif "aviation" in args.train:
        raise Exception("We are not ready for that data yet")
    elif "20news" in args.train:
        ########## 20 news groups ######
        data = load_20newsgroups(categories=categories[0], vectorizer=vct, min_size=min_size,
                                       fix_k=args.fixk)  # for testing purposes
    elif "dummy" in args.train:
        ########## DUMMY DATA###########
        data = load_dummy("C:/Users/mramire8/Documents/code/python/data/dummy", shuffle=True,
                                rnd=2356, vct=vct, min_size=0, fix_k=args.fixk)
    else:
        raise Exception("We do not know that dataset")

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    #### COST MODEL
    parameters = parse_parameters(args.cost_model)

    print("Cost Parameters %s" % parameters)

    cost_model = set_cost_model(parameters)

    print("\nCost Model: %s" % cost_model.__class__.__name__)

    #### ACCURACY MODEL
    accu_parameters = parse_parameters_mat(args.accu_model)

    print("Accuracy Parameters %s" % accu_parameters)

    #### CLASSIFIER
    alpha = 1
    clf = MultinomialNB(alpha=alpha)
    print("\nClassifier: %s" % clf)

    #### EXPERT MODEL
    if "fixed" in args.expert:
        expert = baseexpert.FixedAccuracyExpert(accuracy_value=accu_parameters[0],
                                                cost_function=cost_model.cost_function)  #average value of accuracy of the experts
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "linear" in args.expert:
        raise Exception("We do not know linear yet!!")
    elif "log" in args.expert:
        expert = baseexpert.LogFunctionExpert(model=accu_parameters, cost_function=cost_model.cost_function)
    elif "direct" in args.expert:
        expert = baseexpert.LookUpExpert(accuracy_value=accu_parameters, cost_function=cost_model.cost_function)
    else:
        raise Exception("We need a defined cost function options [fixed|log|linear]")
    print("\nExpert: %s " % expert)

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200
    # Report the min_size actually used above rather than a hard-coded 50.
    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          min_size))

    t0 = time.time()
    ### experiment starts
    for t in range(args.trials):
        print("*" * 60)
        print("Trial: %s" % t)
        # TODO shuffle the data??
        student = randomsampling.RandomSamplingLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t)
        print("\nStudent: %s " % student)
        train_indices = []
        train_x = []
        train_y = []
        pool = Bunch()
        pool.data = data.train.bow.tocsr()   # full words, for training
        pool.fixk = data.train.bowk.tocsr()  # k words BOW for querying
        pool.target = data.train.target
        pool.predicted = []
        pool.kwords = np.array(data.train.kwords)  # k words
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        bootstrapped = False

        current_cost = 0
        iteration = 0
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:

            if not bootstrapped:
                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                print("Bootstrap: %s " % bt.__class__.__name__)
                print("")
            else:
                query_index = student.pick_next(pool=pool, k=step_size)

            query = pool.fixk[query_index]  # query with k words

            query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]]

            ground_truth = pool.target[query_index]
            if iteration == 0:  ## bootstrap uses ground truth
                labels = ground_truth
            else:
                # this expert labels from the query sizes, not the query vectors
                labels = expert.label_instances(query_size, ground_truth)
            # the cost is estimated for every iteration, including the bootstrap
            spent = expert.estimate_instances(query_size)

            query_cost = np.array(spent).sum()
            current_cost += query_cost

            train_indices.extend(query_index)

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # add labels to training
            train_x = pool.data[train_indices]  ## train with all the words

            # update labels with the expert labels
            train_y.extend(labels)
            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # retrain the model
            current_model = student.train(train_x, train_y)
            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print(
            "TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(len(train_indices), accu,
                                                                                              auc, query_cost,
                                                                                              current_cost, spent))

            ## the results should be based on the cost of the labeling
            if iteration > 0:  # skip the bootstrap iteration

                student.budget -= query_cost  ## Bootstrap doesn't count

                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
            iteration += 1
    # `time` is used as a module when t0 is taken, so call time.time() here too.
    print("Elapsed time %.3f" % (time.time() - t0))
    print_results(x_axis, accuracies, aucs)

Example #4

Show file

File: anytime.py Project: mramire8/active

def main():
    """Run the anytime active-learning experiment configured by the module-level ``args``.

    Steps, as implemented below:

    1. Load the dataset and the vectorizer from the pickle cache derived from
       ``args.train`` / ``args.fixk``; on ``IOError`` or ``ValueError`` rebuild
       them with ``load_dataset`` and rewrite the cache files.
    2. Build the cost model, the student classifier and a ``NeutralityExpert``
       whose classifier is fitted on the test split.
    3. For each of ``args.trials`` trials: bootstrap with ground-truth labels,
       then keep querying the expert while the student has budget left,
       retraining and scoring (accuracy, ROC AUC) on the test split after
       every query.
    4. Print the accuracy / AUC results (extrapolated unless the cost function
       matches "uniform").

    Takes no parameters and returns nothing; all configuration comes from ``args``.

    Raises:
        ValueError: if ``args.student`` does not select a known anytime strategy.
        Exception: if the training matrix and label list get out of sync.
    """
    accuracies = defaultdict(list)
    aucs = defaultdict(list)
    x_axis = defaultdict(list)

    vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    # Tokenizer is taken from the freshly built vectorizer, before any cached one replaces `vct`.
    vct_analizer = vct.build_tokenizer()

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    # min_size is computed from the raw fixk value, before negative values are mapped to None.
    min_size = max(100, args.fixk)

    if args.fixk < 0:
        args.fixk = None

    fixk_saved = "{0}{1}.p".format(args.train, args.fixk)
    vct_saved = "{0}vectorizer.p".format(args.train)

    try:
        print("Loading existing file... %s " % args.train)
        # NOTE(review): pickle.load can execute arbitrary code; only point this at trusted cache files.
        # `with` guarantees the handles are closed even when unpickling fails and we fall
        # through to the rebuild branch below.
        with open(fixk_saved, "rb") as fixk_file:
            data = pickle.load(fixk_file)
        with open(vct_saved, "rb") as vectorizer:
            vct = pickle.load(vectorizer)
    except (IOError, ValueError):
        print("Loading from scratch...")
        data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5)
        with open(fixk_saved, "wb") as fixk_file:
            pickle.dump(data, fixk_file)
        with open(vct_saved, "wb") as vectorizer:
            pickle.dump(vct, vectorizer)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    #### COST MODEL
    parameters = parse_parameters_mat(args.cost_model)
    print("Cost Parameters %s" % parameters)

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print("\nCost Model: %s" % cost_model.__class__.__name__)

    #### STUDENT CLASSIFIER
    clf = linear_model.LogisticRegression(penalty="l1", C=1)
    print("\nStudent Classifier: %s" % clf)

    #### EXPERT CLASSIFIER
    # The expert's classifier is fitted on the test split.
    exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(data.test.bow, data.test.target)
    expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                         cost_function=cost_model.cost_function)
    print("\nExpert: %s " % expert)

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200  # only reported in the banner below

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          min_size))
    print("Anytime active learning experiment - use objective function to pick data")
    t0 = time.time()
    tac = []  # per-trial [cost, accuracy] curves
    tau = []  # per-trial [cost, auc] curves

    ### experiment starts
    for t in range(args.trials):
        trial_accu = []
        trial_aucs = []

        print("*" * 60)
        print("Trial: %s" % t)

        # NOTE(review): these are substring tests ("any" or "" also match "anyunc");
        # kept as-is so existing command lines keep selecting the same learner.
        if args.student in "anyunc":
            student = randomsampling.AnytimeLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
                                                    subpool=250, cost_model=cost_model)
        elif args.student in "lambda":
            student = randomsampling.AnytimeLearnerDiff(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
                                                        subpool=250, cost_model=cost_model, lambda_value=args.lambda_value)
        elif args.student in "anyzero":
            student = randomsampling.AnytimeLearnerZeroUtility(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
                                                               subpool=250, cost_model=cost_model)
        else:
            raise ValueError("Oops! We do not know that anytime strategy. Try again.")

        print("\nStudent: %s " % student)

        train_indices = []
        neutral_data = []  # stays a list until the first query row is stored, then becomes a sparse matrix
        train_x = []
        train_y = []
        neu_x = []  # data to train the neutrality classifier
        neu_y = np.array([])

        pool = Bunch()
        pool.data = data.train.bow.tocsr()   # full words, for training
        pool.text = data.train.data
        pool.target = data.train.target
        pool.predicted = []
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        bootstrapped = False

        current_cost = 0
        iteration = 0
        query_index = None
        query_size = None
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
            if not bootstrapped:
                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                query = pool.data[query_index]
                print("Bootstrap: %s " % bt.__class__.__name__)
                print("")
            else:
                ## pick_next returns (chosen, util); each chosen entry is unpacked as (index, k)
                query_chosen, _ = student.pick_next(pool=pool, step_size=step_size)
                query_index = [a for a, b in query_chosen]
                query_size = [b for a, b in query_chosen]

                # Build each query from the first k tokens of the document's raw text.
                qk = [" ".join(vct_analizer(pool.text[q])[0:int(k)]) for q, k in query_chosen]
                query = vct.transform(qk)

            ground_truth = pool.target[query_index]
            if iteration == 0:  ## bootstrap uses ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth)  ## bootstrap cost is ignored
            else:
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            ### accumulate the cost of the query
            query_cost = np.array(spent).sum()
            current_cost += query_cost

            # Answers with a label go to the training set; None answers are counted as neutral.
            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])
            # query_size is still None during the bootstrap iteration, hence the guard.
            neutral_answers = np.array([[x, z] for x, y, z in zip(query_index, labels, query_size) if y is None]) \
                if iteration != 0 else np.array([])

            ## add data recent acquired to train
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])

                # add labels to training
                train_x = pool.data[train_indices]  # train with all the words

                # update labels with the expert labels
                train_y.extend(useful_answers[:, 1])

            if neutral_answers.shape[0] != 0:
                # labels of the current query for the neutrality classifier
                qlbl = []

                # Every row of the query is stored, not only the neutral ones.
                for xik, lbl in zip(query, labels):
                    if isinstance(neutral_data, list):
                        neutral_data = xik
                    else:
                        neutral_data = vstack([neutral_data, xik], format='csr')
                    qlbl.append(neutral_label(lbl))

                ## append the labels of the current query
                neu_y = np.append(neu_y, qlbl)
                neu_x = neutral_data

            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # retrain the model
            current_model = student.train_all(train_x, train_y, neu_x, neu_y)

            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}\tneu:{6}\t{7}".format(
                len(train_indices),
                accu,
                auc, query_cost,
                current_cost,
                format_spent(spent),
                len(neutral_answers), neu_y.shape[0]))

            ## the results should be based on the cost of the labeling
            if iteration > 0:   # skip the bootstrap iteration

                student.budget -= query_cost  ## Bootstrap doesn't count

                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
                # partial trial results
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])

            iteration += 1
            # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
        # end trial loop

    # NOTE(review): substring test; any substring of "uniform" (including "") skips extrapolation.
    if args.cost_function not in "uniform":
        accuracies = extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)

    print("Elapsed time %.3f" % (time.time() - t0))
    print_extrapolated_results(accuracies, aucs)

Example #5

Show file

File: traintestLR.py Project: mramire8/active

def main():
    """Run the active-learning experiment configured by the module-level ``args``.

    Steps, as implemented below:

    1. Load the dataset and vectorizer through ``load_from_file``.
    2. Build the cost model, the classifier and the expert selected by
       ``args.expert`` (fixed / true / log / direct / neutral).
    3. For each of ``args.trials`` trials: bootstrap with ground-truth labels,
       then keep querying the expert while the student has budget left,
       retraining and scoring (accuracy, ROC AUC) on the test split after
       every query.
    4. Print the accuracy / AUC results (extrapolated unless the cost function
       matches "uniform").

    Takes no parameters and returns nothing; all configuration comes from ``args``.

    Raises:
        Exception: if ``args.expert`` names an unsupported expert, or if the
            training matrix and label list get out of sync.
    """
    accuracies = defaultdict(list)
    aucs = defaultdict(list)
    x_axis = defaultdict(list)

    vct = CountVectorizer(encoding='latin-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    # min_size is computed from the raw fixk value, before negative values are mapped to None.
    min_size = max(10, args.fixk)

    if args.fixk < 0:
        args.fixk = None

    data, vct = load_from_file(args.train, categories, args.fixk, min_size, vct)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    #### COST MODEL
    parameters = parse_parameters_mat(args.cost_model)
    print("Cost Parameters %s" % parameters)
    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print("\nCost Model: %s" % cost_model.__class__.__name__)

    #### ACCURACY MODEL
    accu_parameters = parse_parameters_mat(args.accu_model)

    #### CLASSIFIER
    clf = set_classifier(args.classifier)
    print("\nClassifier: %s" % clf)

    #### EXPERT MODEL
    # Branch order matters: the first keyword contained in args.expert wins.
    if "fixed" in args.expert:
        # average value of accuracy of the experts
        expert = baseexpert.FixedAccuracyExpert(accuracy_value=accu_parameters[0],
                                                cost_function=cost_model.cost_function)
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "linear" in args.expert:
        raise Exception("We do not know linear yet!!")
    elif "log" in args.expert:
        expert = baseexpert.LogFunctionExpert(model=accu_parameters, cost_function=cost_model.cost_function)
    elif "direct" in args.expert:
        expert = baseexpert.LookUpExpert(accuracy_value=accu_parameters, cost_function=cost_model.cost_function)
    elif "neutral" in args.expert:
        # The neutral expert's classifier is fitted on the test split.
        exp_clf = LogisticRegression(penalty='l1', C=1)
        exp_clf.fit(data.test.bow, data.test.target)
        expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                             cost_function=cost_model.cost_function)
    else:
        raise Exception("We need a defined cost function options [fixed|log|linear]")

    # NOTE(review): this second classifier is not used by any live code in this function
    # (it was previously read only to feed a disabled feature print-out). The fit is kept
    # so runtime behaviour is untouched; confirm before deleting.
    exp_clf = LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(data.test.bow, data.test.target)
    print("\nExpert: %s " % expert)

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200  # only reported in the banner below

    # Report the min_size actually passed to load_from_file rather than a fixed number.
    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          min_size))

    t0 = time.time()
    tac = []  # per-trial [cost, accuracy] curves
    tau = []  # per-trial [cost, auc] curves

    ### experiment starts
    for t in range(args.trials):
        trial_accu = []
        trial_aucs = []

        print("*" * 60)
        print("Trial: %s" % t)

        # NOTE(review): substring test ("", "u", "un", "nc", ... also match "unc");
        # kept as-is so existing command lines keep selecting the same learner.
        if args.student in "unc":
            student = randomsampling.UncertaintyLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t,
                                                        subpool=250)
        else:
            student = randomsampling.RandomSamplingLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t)

        print("\nStudent: %s " % student)

        train_indices = []
        train_x = []
        train_y = []
        pool = Bunch()
        pool.data = data.train.bow.tocsr()   # full words, for training
        if args.fixk is None:
            pool.fixk = data.train.bow.tocsr()
        else:
            pool.fixk = data.train.bowk.tocsr()  # k words BOW for querying
        pool.target = data.train.target
        pool.predicted = []
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        bootstrapped = False

        current_cost = 0
        iteration = 0
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:

            if not bootstrapped:
                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                print("Bootstrap: %s " % bt.__class__.__name__)
                print("")
            else:
                query_index = student.pick_next(pool=pool, k=step_size)

            # Queries use the full-word representation; every query is given size 1.
            query = pool.data[query_index]
            query_size = [1] * query.shape[0]

            ground_truth = pool.target[query_index]

            if iteration == 0:  ## bootstrap uses ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth)  ## bootstrap cost is ignored
            else:
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            query_cost = np.array(spent).sum()
            current_cost += query_cost

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # Only answers that carry a label (not None) are added to the training set.
            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])
                # add labels to training
                train_x = pool.data[train_indices]  ## train with all the words
                # update labels with the expert labels
                train_y.extend(useful_answers[:, 1])

            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # retrain the model
            current_model = student.train(train_x, train_y)

            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print(
                "TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(len(train_indices), accu,
                                                                                                  auc, query_cost,
                                                                                                  current_cost,
                                                                                                  format_spent(spent)))

            ## the results should be based on the cost of the labeling
            if iteration > 0:  # skip the bootstrap iteration

                student.budget -= query_cost  ## Bootstrap doesn't count

                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
                # partial trial results
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])

            iteration += 1
            # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
        # end trial loop

    # NOTE(review): substring test; any substring of "uniform" (including "") skips extrapolation.
    if args.cost_function not in "uniform":
        accuracies = extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)

    print("Elapsed time %.3f" % (time.time() - t0))
    print_extrapolated_results(accuracies, aucs)