Example #1
 def getTrainingTestSetSplit(self,
                             trainingSetPercentageSplit=0.8,
                             randomStateSeed=12345):
     print('starting getTrainingTestSetSplit')
     trainingSet = Bunch()
     testSet = Bunch()
     articleIdsSet = shuffle(list(
         set([elem[0] for elem in (self.dataset).data])),
                             random_state=randomStateSeed)
     # hold out roughly (1 - trainingSetPercentageSplit) of the article ids
     testSetSize = int(
         math.floor(((1.0 - trainingSetPercentageSplit) *
                     len(articleIdsSet)) - 1))
     articlesIdsTestSet = articleIdsSet[0:testSetSize]
     trainingSet.data = [
         elem for elem in (self.dataset).data
         if elem[0] not in articlesIdsTestSet
     ]
     trainingSet.target = [
         (self.dataset).target[elemIndex]
         for elemIndex in range(len((self.dataset).target))
         if (self.dataset).data[elemIndex][0] not in articlesIdsTestSet
     ]
     trainingSet.target_names = (self.dataset).target_names
     testSet.data = [
         elem for elem in (self.dataset).data
         if elem[0] in articlesIdsTestSet
     ]
     testSet.target = [
         (self.dataset).target[elemIndex]
         for elemIndex in range(len((self.dataset).target))
         if (self.dataset).data[elemIndex][0] in articlesIdsTestSet
     ]
     testSet.target_names = (self.dataset).target_names
     print('ended getTrainingTestSetSplit')
     return (trainingSet, testSet)
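The point of the method above is a leakage-free split: every row belonging to one article lands entirely in the training set or entirely in the test set. A minimal standalone sketch of the same idea on invented (articleId, text) rows — the data, the 0.2 test fraction, and the maintained sklearn.utils import path for Bunch/shuffle are assumptions, not code from the example:

import math
from sklearn.utils import Bunch, shuffle

rows = [("a1", "s1"), ("a1", "s2"), ("a2", "s3"), ("a3", "s4")]  # invented data
targets = [0, 1, 0, 1]

articleIds = shuffle(sorted({r[0] for r in rows}), random_state=12345)
testSetSize = max(1, int(math.floor(0.2 * len(articleIds))))  # hold out at least one article
testIds = set(articleIds[:testSetSize])

trainingSet, testSet = Bunch(), Bunch()
trainingSet.data = [r for r in rows if r[0] not in testIds]
trainingSet.target = [t for r, t in zip(rows, targets) if r[0] not in testIds]
testSet.data = [r for r in rows if r[0] in testIds]
testSet.target = [t for r, t in zip(rows, targets) if r[0] in testIds]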
Example #2
    def getTrainingTestSetSplit(self, trainingSetPercentageSplit=0.6, randomStateSeed=12345):
        trainingSet = Bunch()
        testSet = Bunch()
        X_train, X_test, y_train, y_test = train_test_split(
            self.dataset.data, self.dataset.target,
            test_size=1 - trainingSetPercentageSplit,
            random_state=randomStateSeed)
        trainingSet.data = X_train
        trainingSet.target = y_train
        trainingSet.target_names = (self.dataset).target_names

        testSet.data = X_test
        testSet.target = y_test
        testSet.target_names = (self.dataset).target_names
        return (trainingSet, testSet)
    def randomUndersamplingForBinaryClassification(self,
                                                   dataset,
                                                   randomSeed=12345):
        b2 = Bunch()

        b2.data = []
        b2.target = []
        b2.target_names = dataset.target_names

        datasetTemp = Bunch()
        datasetTemp.data = []
        datasetTemp.target = []

        # shuffle dataset
        datasetTemp.data, datasetTemp.target = shuffle(dataset.data,
                                                       dataset.target,
                                                       random_state=randomSeed)

        positiveExamplesIndexes = [
            i for i in range(len(datasetTemp.target))
            if datasetTemp.target[i] == 1
        ]

        numberOfPositiveExamples = len(positiveExamplesIndexes)
        numberOfNegativeExamples = len(
            datasetTemp.target) - len(positiveExamplesIndexes)

        # keep a 1:1 class ratio by capping both classes at the minority count
        numberOfPositiveExamplesToKeep = min(numberOfPositiveExamples,
                                             numberOfNegativeExamples)
        numberOfNegativeExamplesToKeep = numberOfPositiveExamplesToKeep

        for trainingExampleIndex in range(len(datasetTemp.target)):

            if datasetTemp.target[trainingExampleIndex] == 1:
                if numberOfPositiveExamplesToKeep > 0:
                    (b2.data).append(datasetTemp.data[trainingExampleIndex])
                    (b2.target).append(1)
                    numberOfPositiveExamplesToKeep -= 1
            else:
                if numberOfNegativeExamplesToKeep > 0:
                    (b2.data).append(datasetTemp.data[trainingExampleIndex])
                    (b2.target).append(0)
                    numberOfNegativeExamplesToKeep -= 1

        return b2
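For comparison, the same 1:1 random undersampling is available off the shelf in the optional imbalanced-learn package; a sketch on toy arrays (RandomUnderSampler is not used anywhere in these examples, it is just the library equivalent of the loop above):

import numpy as np
from imblearn.under_sampling import RandomUnderSampler  # optional dependency

X = np.array([[0], [1], [2], [3], [4]])
y = np.array([1, 1, 1, 0, 0])  # 3 positives, 2 negatives

rus = RandomUnderSampler(random_state=12345)
X_res, y_res = rus.fit_resample(X, y)
print(sorted(y_res))  # [0, 0, 1, 1] -- majority class capped at the minority count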
Example #4
    def train(self, items):
        topic_words = self.topic(items)

        sentences = self.items2sentences(items)
        texts = self.sentences2texts(sentences)
        id2word = self.sentences2dict(sentences)

        cats = sorted(list(set([item["cat"] for item in items])))

        test1 = Bunch()
        test1.target = [cats.index(item['cat']) for item in items]
        test1.target_names = cats
        #print test1['target']

        test1.data = []
        for item in items:
            row = []
            for topic_word in topic_words:
                if topic_word in item["q"]:
                    row.append(1)
                else:
                    row.append(0)
            test1.data.append(row)
        #print test1['data'][0]

        #X_train = X_train_data.as_matrix()
        #y_train = y_train_data.as_matrix()

        clf = Pipeline([
            #("imputer", Imputer(missing_values='NaN', strategy="mean", axis=0)),
            #        ('feature_selection', VarianceThreshold(threshold=(.97 * (1 - .97)))),
            #    ('feature_selection', SelectKBest(chi2, k=50)),
            #    ('scaler', StandardScaler()),
            #        ('classification', svm.SVC(class_weight='balanced', cache_size=10240))])
            #        ('classification', svm.LinearSVC(class_weight='balanced'))
            #        ('classification', SGDClassifier(n_jobs=-1))
            ('classification', GradientBoostingClassifier())
        ])
        text_clf = clf.fit(test1.data, test1.target)

        #3fold
        scores = cross_validation.cross_val_score(text_clf,
                                                  test1.data,
                                                  test1.target,
                                                  cv=3)
        print scores
        print scores.mean(), scores.std()

        #confusion
        predicted = text_clf.predict(test1.data)
        print(
            metrics.classification_report(test1.target,
                                          predicted,
                                          target_names=test1.target_names))

        print(metrics.confusion_matrix(test1.target, predicted))
        #predicted = text_clf.predict(docs_test)
        #metrics.confusion_matrix(test1.target, predicted)

        return text_clf
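The feature construction inside train is just a 0/1 presence vector per item over topic_words; isolated from the surrounding class (the topic_words list and the items here are invented, and self.topic would normally derive the words), it reduces to:

topic_words = ["rain", "game"]  # invented; train() derives these via self.topic
items = [{"cat": "weather", "q": "will it rain"},
         {"cat": "sports", "q": "who won the game"}]

cats = sorted({item["cat"] for item in items})
target = [cats.index(item["cat"]) for item in items]
data = [[1 if w in item["q"] else 0 for w in topic_words] for item in items]
print(data, target)  # [[1, 0], [0, 1]] [1, 0]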
def combine_sessions(sessions, **kwargs):
    """ Merge session data sets in single data set.
    """
    # make a copy of the sessions, just to be safe
    sessions_ = list(sessions)

    # define dataset based on first session
    dataset_ = None 

    # append data from other sessions
    for i, session_ in enumerate(sessions_):
        print("[+] session: {}, file: {}".format(
            i, session_.meta.reset_index(drop=False).session[0])
            )
        if dataset_ is None:
            dataset_ = Bunch(**dict(session_))
        else:
            dataset_.data = dataset_.data.append(session_.data, ignore_index=True, sort=False)
            dataset_.meta = dataset_.meta.append(session_.meta, ignore_index=False, sort=False)
            dataset_.tmask = dataset_.tmask.append(session_.tmask, ignore_index=False, sort=False)
        
    # clean
    if kwargs.get('clean_meta'):
        dataset_.meta = clean_meta(dataset_.meta, **kwargs).reset_index(drop=False)

    # set X, y
    dataset_.X = dataset_.data.values.reshape(-1, dataset_.data.shape[-1])
    dataset_.y = dataset_.meta.values.reshape(-1, dataset_.meta.shape[-1])

    # cache sessions
    dataset_.sessions = list(sessions)
    return dataset_
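combine_sessions assumes each session is a Bunch whose data, meta and tmask fields are pandas DataFrames, and it relies on DataFrame.append, which only exists in pandas before 2.0. A hypothetical two-session call (the helper and its column names are invented):

import pandas as pd
from sklearn.utils import Bunch

def make_session(name):  # invented helper with the fields combine_sessions reads
    return Bunch(data=pd.DataFrame({"x": [1.0, 2.0]}),
                 meta=pd.DataFrame({"session": [name, name]}),
                 tmask=pd.DataFrame({"keep": [True, True]}))

dataset = combine_sessions([make_session("s1"), make_session("s2")])
print(dataset.X.shape)  # (4, 1): two rows from each session, one feature column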
def createWikiDast(path_to_corpus):
    lineNum = 0
    printLine = 0
    data = []
    target = []
    with open(path_to_corpus) as raw_c:
        dir = os.path.abspath(
            os.path.join(path_to_corpus, os.pardir))  # TODO check that this is the parent directory of the file.
        print(dir)
        line = 'first'
        while line != "":
            line = raw_c.readline()

            lineNum += 1
            printLine += 1
            if printLine == 1000000:
                print(lineNum)
                printLine = 0
            if lineNum == 364270:
                break
            data.append(line)
            target.append(1)
    dast = Bunch()
    dast.data = data
    dast.target = target
    # dast.target = numpy.zeros(shape=(lineNum), dtype='int32')
    # docs = {'data': data, 'target':np.asarray(target)}
    return dast
Example #8
 def shuffleData(self, res):
     shuffle(res)
     train = Bunch()
     train.data = map(lambda x: x[1], res)
     train.target = map(lambda x: x[0], res)
     train.target_names = self.names
     return train
def createFullCategoryESWiki(enPathList, simPathList):
    data = []
    target = []
    for index, enPath in enumerate(enPathList):
        d, t = createESWiki(enPath, simPathList[index])
        data.extend(d)
        target.extend(t)

    docs = Bunch()
    docs.data = data
    docs.target = target
    return docs
Example #10
def get_data(whichData='train'):
    dataset = Bunch()
    dataset.data = np.array([]) 
    dataset.target = np.array([])

    if whichData=='train':
        data = d3mds.get_train_data()
        targets = d3mds.get_train_targets()
    elif whichData=='test':
        data = d3mds.get_test_data()
        targets = d3mds.get_test_targets()
    else:
        raise RuntimeError('get_data should be passed either train or test, but got %s' % whichData)

    for i, rf in enumerate(data['raw_text_file']):
        path = os.path.join(textPath, rf)
        with open(path, encoding='utf-8') as fh:
            raw = fh.read()
        dataset.data = np.append(dataset.data, raw)
    dataset.target = targets.ravel()

    return dataset
Example #11
def split_data(data_set, split_amount):
    data = Bunch()
    data.data = data_set.values[:, 0:-1]
    data.target = data_set.values[:, -1]

    split_index = int(split_amount * len(data.data))
    indices = np.random.permutation(len(data.data))
    # indices = range(len(data.data))

    train_data = data.data[indices[:split_index]]
    train_target = data.target[indices[:split_index]]

    test_data = data.data[indices[split_index:]]
    test_target = data.target[indices[split_index:]]

    return train_data, train_target, test_data, test_target
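split_data expects something DataFrame-like whose last column holds the label (it reads data_set.values). A quick check with a tiny invented frame, assuming split_data above is in scope:

import numpy as np
import pandas as pd

df = pd.DataFrame({"f1": [0, 1, 2, 3], "f2": [1, 1, 0, 0], "label": [0, 0, 1, 1]})
X_tr, y_tr, X_te, y_te = split_data(df, split_amount=0.75)
print(X_tr.shape, X_te.shape)  # (3, 2) (1, 2)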
def createTwitterDast(path):
    csv.field_size_limit(sys.maxsize)
    data = []
    target = []
    with open(path, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=' ', quotechar='|')
        for row in spamreader:
            sent = ""
            for counter, word in enumerate(row):
                if counter > 5 and counter + 1 < len(row):
                    sent = sent + " " + word
            data.append(sent)
            target.append(0)
            # print sent
    docs = Bunch()
    docs.data = data
    docs.target = target
    return docs
def createPWKP(path):
    sentPack = []
    data = []
    target = []
    with open(path) as f:
        line = f.readline()
        while line != "":
            while line != "\n" and line != "":  # also stop at EOF to avoid an infinite loop
                sentPack.append(line)
                line = f.readline()
            if len(sentPack) != 0:
                data.append(sentPack[0])
                target.append(0)
                data.append(sentPack[1])
                target.append(1)
            sentPack = []
            line = f.readline()
    docs = Bunch()
    docs.data = data
    docs.target = target
    return docs
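createPWKP reads blank-line-separated sentence packs and keeps the first line of each pack as the complex sentence (target 0) and the second as its simplification (target 1). A quick round trip through a temporary file, assuming createPWKP above is in scope (the sample text is invented):

import os
import tempfile

sample = "Complex A.\nSimple A.\n\nComplex B.\nSimple B.\n\n"
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
    tmp.write(sample)

docs = createPWKP(tmp.name)
print(docs.target)  # [0, 1, 0, 1]
os.unlink(tmp.name)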
Example #14
def load_subject_data(dataset,
                      index=0,
                      mask='mask_vt',
                      sample_mask=None,
                      smoothing_fwhm=4,
                      **kwargs):
    """ Load functional data for a single haxby subject. """
    # extract relevant files
    func_fn = dataset.func[index]
    mask_fn = dataset.get(mask)
    if not isinstance(mask_fn, str):
        mask_fn = mask_fn[index]

    # extract data from func using mask_vt
    masker = NiftiMasker(
        mask_img=mask_fn,
        sample_mask=sample_mask,
        standardize=True,
        detrend=True,
        smoothing_fwhm=smoothing_fwhm,
        low_pass=0.09,
        high_pass=0.008,
        t_r=2.5,
        memory="nilearn_cache",
    )
    X = masker.fit_transform(func_fn)
    data = pd.DataFrame(X)

    # return as bunch
    subject = Bunch()
    subject.data = data
    subject.X = X
    subject.masker = masker
    subject.mask = mask_fn
    subject.func = func_fn
    subject.subject_code = os.path.basename(os.path.dirname(func_fn))
    return subject
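A hypothetical call, assuming nilearn is installed and its Haxby fetcher supplies the dataset Bunch (the first call downloads the data, and the subjects argument here is an assumption about how the caller fetched it):

from nilearn.datasets import fetch_haxby

haxby = fetch_haxby(subjects=[1])  # Bunch with func and mask_* file lists
subject = load_subject_data(haxby, index=0, mask='mask_vt')
print(subject.subject_code, subject.X.shape)  # e.g. subj1, (timepoints, voxels)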
Example #15
def main():
    accuracies = defaultdict(lambda: [])

    aucs = defaultdict(lambda: [])

    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='latin-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()
    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(10, args.fixk)

    if args.fixk < 0:
        args.fixk = None

    # data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5)
    # fixk_saved = "{0}{1}.p".format(args.train, args.fixk)

    data, vct = load_from_file(args.train, categories, args.fixk, min_size, vct)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    #### COST MODEL
    parameters = parse_parameters_mat(args.cost_model)
    print "Cost Parameters %s" % parameters
    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    #### ACCURACY MODEL
    accu_parameters = parse_parameters_mat(args.accu_model)

    #### CLASSIFIER
    clf = set_classifier(args.classifier)
    print "\nClassifier: %s" % clf

    #### EXPERT MODEL

    if "fixed" in args.expert:
        expert = baseexpert.FixedAccuracyExpert(accuracy_value=accu_parameters[0],
                                                cost_function=cost_model.cost_function)  #average value of accuracy of the experts
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "linear" in args.expert:
        #expert = baseexpert.LRFunctionExpert(model=[0.0019, 0.6363],cost_function=cost_model.cost_function)
        raise Exception("We do not know linear yet!!")
    elif "log" in args.expert:
        expert = baseexpert.LogFunctionExpert(model=accu_parameters, cost_function=cost_model.cost_function)
    elif "direct" in args.expert:
        expert = baseexpert.LookUpExpert(accuracy_value=accu_parameters, cost_function=cost_model.cost_function)
    elif "neutral" in args.expert:
        exp_clf = LogisticRegression(penalty='l1', C=1)
        exp_clf.fit(data.test.bow, data.test.target)
        expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                         cost_function=cost_model.cost_function)
    else:
        raise Exception("We need a defined cost function options [fixed|log|linear]")

    exp_clf = LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(data.test.bow, data.test.target)
    print "\nExpert: %s " % expert
    coef = exp_clf.coef_[0]
    # print_features(coef, vct.get_feature_names())
    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          50))

    t0 = time.time()
    tac = []
    tau = []
    ### experiment starts
    for t in range(args.trials):
        trial_accu = []

        trial_aucs = []

        print "*" * 60
        print "Trial: %s" % t
        if args.student in "unc":
            student = randomsampling.UncertaintyLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t,
                                                        subpool=250)
        else:
            student = randomsampling.RandomSamplingLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t)

        print "\nStudent: %s " % student

        train_indices = []
        train_x = []
        train_y = []
        pool = Bunch()
        pool.data = data.train.bow.tocsr()   # full words, for training
        if args.fixk is None:
            pool.fixk = data.train.bow.tocsr()
        else:
            pool.fixk = data.train.bowk.tocsr()  # k words BOW for querying
        pool.target = data.train.target
        pool.predicted = []
        # pool.kwords = np.array(data.train.kwords)  # k words
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool


        bootstrapped = False

        current_cost = 0
        iteration = 0
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:

            if not bootstrapped:
                ## random bootstrap
                #bt = randomsampling.BootstrapRandom(random_state=t * 10)

                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:
                query_index = student.pick_next(pool=pool, k=step_size)

            # query = pool.fixk[query_index]  # query with k words
            query = pool.data[query_index]
            # print query_index
            # query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]]
            query_size = [1]*query.shape[0]

            ground_truth = pool.target[query_index]

            if iteration == 0: ## bootstrap uses ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth)
            else:
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            query_cost = np.array(spent).sum()
            current_cost += query_cost

            # train_indices.extend(query_index)

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # add labels to training
            # train_x = pool.data[train_indices]  ## train with all the words

            # update labels with the expert labels
            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])
                # add labels to training
                train_x = pool.data[train_indices]  ## train with all the words
                # update labels with the expert labels
                train_y.extend(useful_answers[:, 1])


            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # retrain the model
            current_model = student.train(train_x, train_y)
            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print (
            "TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(len(train_indices), accu,
                                                                                              auc, query_cost,
                                                                                              current_cost, format_spent(spent)))

            ## the results should be based on the cost of the labeling
            if iteration > 0: # bootstrap iteration

                student.budget -= query_cost ## Bootstrap doesn't count

                #x_axis_range = int(current_cost / eval_range)
                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])

            iteration += 1
            # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
        #end trial loop
    if args.cost_function not in "uniform":
        accuracies = extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)

    print("Elapsed time %.3f" % (time.time() - t0))
    print_extrapolated_results(accuracies, aucs)
Example #16
# -*- coding: utf-8 -*-

import pandas
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import StandardScaler
from sklearn.datasets.base import Bunch

# 1. Load the training and test sets from the files perceptron-train.csv
# and perceptron-test.csv. The target variable is in the first column,
# the features in the second and third.

data = pandas.read_csv('perceptron-train.csv', header=None)
train, test = Bunch(), Bunch()
train.data, train.target = data.loc[:, 1:], data.loc[:, 0]
data = pandas.read_csv('perceptron-test.csv', header=None)
test.data, test.target = data.loc[:, 1:], data.loc[:, 0]

# 2. Train a perceptron with default parameters and random_state=241

perc = Perceptron(random_state=241)
perc.fit(train.data, train.target)  # learning

# 3. Compute the quality (the fraction of correctly classified objects,
# accuracy) of the resulting classifier on the test set.

accuracy = perc.score(test.data, test.target)  # predicting
print(accuracy)

# 4. Normalize the training and test sets using the StandardScaler class.
import gzip
import pickle

import numpy as np
from skimage import io
from sklearn.datasets.base import Bunch

from dip.load_data import load_image_files, load_mask_images
from dip.mask import bounding_rect_of_mask


datasets = load_mask_images()

data = []
for f, mask in zip(
        datasets.filenames,
        load_image_files(datasets.filenames),
        ):
    # rect: (min_x, min_y, max_x, max_y)
    rect = bounding_rect_of_mask(mask, negative=True)
    data.append(list(rect))
    print('{0}: {1}'.format(f, rect))

bunch = Bunch(name='mask rects')
bunch.data = np.array(data)
bunch.filenames = datasets.filenames
bunch.target = datasets.target
bunch.target_names = datasets.target_names
bunch.description = 'mask rects: (min_x, min_y, max_x, max_y)'

with gzip.open('rects.pkl.gz', 'wb') as f:
    pickle.dump(bunch, f)
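Reading the bunch back follows the same gzip + pickle pairing used to write it:

import gzip
import pickle

with gzip.open('rects.pkl.gz', 'rb') as f:
    bunch = pickle.load(f)
print(bunch.name, bunch.data.shape)  # 'mask rects', (n_images, 4)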
Example #18
def mlviz_two(_, a, b, c):
    import numpy as np
    import pandas as pd
    from sklearn import datasets
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, log_loss
    import xgboost as xgb
    import json
    from sklearn.datasets.base import Bunch
    from sklearn.preprocessing import LabelEncoder
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.impute import SimpleImputer
    from sklearn.svm import SVC
    data = a
    train = b
    test = c
    names = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'martial-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income'
    ]
    meta = {
        'target_names': list(data.income.unique()),
        'feature_names': list(data.columns),
        'categorical_features': {
            column: list(data[column].unique())
            for column in data.columns if data[column].dtype == 'object'
        }
    }
    names = meta['feature_names']
    meta['categorical_features'].pop('income')
    dataset = Bunch(data=train[names[:-1]],
                    target=train[names[-1]],
                    data_test=test[names[:-1]],
                    target_test=test[names[-1]],
                    target_names=meta['target_names'],
                    feature_names=meta['feature_names'],
                    categorical_features=meta['categorical_features'],
                    DESCR="descr")

    # return dataset

    class EncodeCategorical(BaseEstimator, TransformerMixin):
        """
        Encodes a specified list of columns or all columns if None.
        """
        def __init__(self, columns=None):
            self.columns = columns
            self.encoders = None

        def fit(self, data, target=None):
            """
            Expects a data frame with named columns to encode.
            """
            # Encode all columns if columns is None
            if self.columns is None:
                self.columns = data.columns
            # Fit a label encoder for each column in the data frame
            self.encoders = {
                column: LabelEncoder().fit(data[column])
                for column in self.columns
            }
            return self

        def transform(self, data):
            """
            Uses the encoders to transform a data frame.
            """
            output = data.copy()
            for column, encoder in self.encoders.items():
                output[column] = encoder.transform(data[column])
            return output

    encoder = EncodeCategorical(dataset.categorical_features.keys())
    dataset.data = encoder.fit_transform(dataset.data)
    # note: refitting the encoders on the test set leaks its label vocabulary;
    # encoder.transform(dataset.data_test) would be the stricter choice
    dataset.data_test = encoder.fit_transform(dataset.data_test)

    # return dataset

    class ImputeCategorical(BaseEstimator, TransformerMixin):
        """
        Encodes a specified list of columns or all columns if None.
        """
        def __init__(self, columns=None):
            self.columns = columns
            self.imputer = None

        def fit(self, data, target=None):
            """
            Expects a data frame with named columns to impute.
            """
            # Encode all columns if columns is None
            if self.columns is None:
                self.columns = data.columns
            # Fit an imputer for each column in the data frame
            self.imputer = SimpleImputer(missing_values=0,
                                         strategy='most_frequent')
            self.imputer.fit(data[self.columns])
            return self

        def transform(self, data):
            """
            Uses the encoders to transform a data frame.
            """
            output = data.copy()
            output[self.columns] = self.imputer.transform(output[self.columns])
            return output

    imputer = ImputeCategorical(['workclass', 'native-country', 'occupation'])
    dataset.data = imputer.fit_transform(dataset.data)
    dataset.data_test = imputer.fit_transform(dataset.data_test)

    X_train = dataset.data
    yencode = LabelEncoder().fit(dataset.target)
    y_train = yencode.transform(dataset.target)

    X_test = dataset.data_test
    y_test = yencode.transform([y.rstrip(".") for y in dataset.target_test])

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    def grid_test_xgboost(colsample_tree, subsample, max_depth,
                          min_child_weight, eta):
        # train model
        params = {
            'objective': 'multi:softprob',
            'num_class': 2,
            'eval_metric': 'mlogloss',
            'max_depth': max_depth,
            'min_child_weight': min_child_weight,
            'eta': eta,
            'subsample': subsample,
            'colsample_bytree': colsample_tree
        }
        model = xgb.train(params,
                          dtrain,
                          evals=[(dtrain, 'train')],
                          verbose_eval=False)

        # evaluate model
        y_proba = model.predict(dtest)
        y_pred = y_proba.argmax(axis=1)
        loss = log_loss(y_test, y_proba)
        acc = accuracy_score(y_test, y_pred)

        return acc

    def grid_test_svm(kernel, gamma, C):
        clf = SVC(kernel=kernel, gamma=gamma, C=C).fit(X_train, y_train)
        accuracy = clf.score(X_test, y_test)
        return accuracy

    # colsample_tree = [1.0]
    # subsample = [1.0]
    # max_depth = [1, 10]
    # min_child_weight = [1, 10]
    # eta = [.9, .3, .01, .005]

    colsample_tree = [1.0]
    subsample = [1.0]
    max_depth = [1]
    min_child_weight = [1]
    eta = [.9]
    val = None

    for i in colsample_tree:
        for j in subsample:
            for k in max_depth:
                for l in min_child_weight:
                    for m in eta:
                        val = grid_test_xgboost(i, j, k, l, m)

    return val
Example #19
def train(classify_name):
    digits = Bunch()
    digits.data = []
    digits.target = []
    digits.target_names = []
    parent_path = classify_name
    for category in os.listdir(parent_path):
        full_category_path = os.path.join(parent_path, category)
        if not os.path.isdir(full_category_path):
            continue
        for file in os.listdir(full_category_path):
            full_file_path = os.path.join(full_category_path, file)
            im = cv2.imread(full_file_path)
            im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
            im = cv2.resize(im, (30, 30), interpolation=cv2.INTER_AREA)
            im = np.array(im, 'float64')
            digits.data.append(im)
            digits.target.append(category)
        if category not in digits.target_names:
            digits.target_names.append(category)

    # Extract the features and labels
    features = np.array(digits.data, 'int16')
    labels = np.array(digits.target, str)

    # Extract the hog features
    list_hog_fd = []
    for feature in features:
        fd = hog(feature.reshape((30, 30)),
                 orientations=9,
                 pixels_per_cell=(14, 14),
                 cells_per_block=(1, 1),
                 visualise=False)
        list_hog_fd.append(fd)
    hog_features = np.array(list_hog_fd, 'float64')

    # Normalize the features
    pp = preprocessing.StandardScaler().fit(hog_features)
    hog_features = pp.transform(hog_features)

    print "training..."
    from sklearn import svm
    clf = svm.SVC(gamma=0.001, C=100.)
    clf.fit(hog_features, labels)
    # Save the classifier
    from sklearn.externals import joblib
    joblib.dump((clf, pp), 'PKL/%s.pkl' % classify_name, compress=3)

    print "testing..."
    correct = 0
    incorrect = 0
    incorrect_list = {}
    for category in os.listdir(parent_path):
        full_category_path = os.path.join(parent_path, category)
        if not os.path.isdir(full_category_path):
            continue
        for file in os.listdir(full_category_path):
            full_file_path = os.path.join(full_category_path, file)
            pic_data = cv2.imread(full_file_path)
            pic_data = cv2.cvtColor(pic_data, cv2.COLOR_BGR2GRAY)
            pic_data = cv2.resize(pic_data, (30, 30),
                                  interpolation=cv2.INTER_AREA)
            pic_data = np.array(pic_data, 'int16')
            pic_hog_fd = hog(pic_data.reshape((30, 30)),
                             orientations=9,
                             pixels_per_cell=(14, 14),
                             cells_per_block=(1, 1),
                             visualise=False)
            pic_hog_fd = pp.transform(np.array([pic_hog_fd], 'float64'))
            print full_file_path
            prediction = clf.predict(pic_hog_fd)[0]
            print prediction
            if prediction == category:
                correct += 1
            else:
                incorrect += 1
                incorrect_list.update({full_file_path: prediction})
    return correct, incorrect, incorrect_list
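Loading the saved (classifier, scaler) pair back for a one-off prediction mirrors the preprocessing above; the file paths here are hypothetical, and the old-style joblib import and visualise spelling match the sklearn/skimage versions this example targets:

import cv2
import numpy as np
from skimage.feature import hog
from sklearn.externals import joblib  # old sklearn; plain `import joblib` in newer code

clf, pp = joblib.load('PKL/digits.pkl')        # hypothetical model file
im = cv2.imread('some_digit.png')              # hypothetical input image
im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
im = cv2.resize(im, (30, 30), interpolation=cv2.INTER_AREA)
fd = hog(im.reshape((30, 30)), orientations=9, pixels_per_cell=(14, 14),
         cells_per_block=(1, 1), visualise=False)
print(clf.predict(pp.transform(np.array([fd], 'float64')))[0])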
Example #20
 def getCompleteDataset(self):
     b = Bunch()
     b.data = (self.dataset).data
     b.target = (self.dataset).target
     b.target_names = (self.dataset).target_names
     return b
Example #21
    else:
        print(filename, 'is not a regular file.', file=sys.stderr, flush=True)
    print(status_update(filename, no_samples, end), flush=True)
    no_samples += 1

if len(data) == 0 or len(target) == 0:
    print('Data array collection error: no data found.',
          file=sys.stderr,
          flush=True)
    exit(1)

X = np.asarray(data)
y = np.asarray(target)

samples = Bunch()
samples.data = data
samples.target = target
samples_file = path.join(args.directory, 'poly2d.pkl.xz')
joblib.dump(samples, samples_file)

cv_neighbors = 5
knn = KNeighborsClassifier(n_neighbors=cv_neighbors, n_jobs=-1)
knn.fit(X, y)
model_file = path.join(args.directory, 'knn_model.pkl.xz')
joblib.dump(knn, model_file)

cv_folds = 5
try:
    scores = cross_val_score(knn, X, y, cv=cv_folds)
except ValueError as e:
    message = 'Error computing cross_val_score.'
Example #22
def main():
    accuracies = defaultdict(lambda: [])

    aucs = defaultdict(lambda: [])

    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()
    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(50, args.fixk)

    if "imdb" in args.train:
        ########## IMDB MOVIE REVIEWS ###########
        data = load_imdb(args.train, shuffle=True, rnd=2356, vct=vct, min_size=min_size,
                               fix_k=args.fixk)  # should brind data as is
    elif "aviation" in args.train:
        raise Exception("We are not ready for that data yet")
    elif "20news" in args.train:
        ########## 20 news groups ######
        data = load_20newsgroups(categories=categories[0], vectorizer=vct, min_size=min_size,
                                       fix_k=args.fixk)  # for testing purposes
    elif "dummy" in args.train:
        ########## DUMMY DATA###########
        data = load_dummy("C:/Users/mramire8/Documents/code/python/data/dummy", shuffle=True,
                                rnd=2356, vct=vct, min_size=0, fix_k=args.fixk)
    else:
        raise Exception("We do not know that dataset")

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))
    #print(data.train.data[0])
    #### COST MODEL
    parameters = parse_parameters(args.cost_model)

    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(parameters)

    print "\nCost Model: %s" % cost_model.__class__.__name__

    #### ACCURACY MODEL
    # try:
    # #     accu_parameters = parse_parameters(args.accu_model)
    # except ValueError:
    accu_parameters = parse_parameters_mat(args.accu_model)
    # else
    #     print("Error: Accuracy parameters didn't work")

    print "Accuracy Parameters %s" % accu_parameters
    #if "fixed" in args.accu_function:
    #    accuracy_model = base_models.FixedAccuracyModel(accuracy_value=.7)
    #elif "log" in args.accu_function:
    #    accuracy_model = base_models.LogAccuracyModel(model=parameters)
    #elif "linear" in args.accu_function:
    #    accuracy_model = base_models.LRAccuracyModel(model=parameters)
    #else:
    #    raise Exception("We need a defined cost function options [fixed|log|linear]")
    #
    #print "\nAccuracy Model: %s " % accuracy_model

    #### CLASSIFIER
    #### Informed priors
    #feature_counts = np.ones(x_train.shape[0]) * x_train
    #feature_frequencies = feature_counts / np.sum(feature_counts)
    #alpha = feature_frequencies
    alpha = 1
    clf = MultinomialNB(alpha=alpha)
    print "\nClassifier: %s" % clf

    #### EXPERT MODEL
    #expert = baseexpert.BaseExpert()
    if "fixed" in args.expert:
        expert = baseexpert.FixedAccuracyExpert(accuracy_value=accu_parameters[0],
                                                cost_function=cost_model.cost_function)  #average value of accuracy of the experts
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "linear" in args.expert:
        #expert = baseexpert.LRFunctionExpert(model=[0.0019, 0.6363],cost_function=cost_model.cost_function)
        raise Exception("We do not know linear yet!!")
    elif "log" in args.expert:
        expert = baseexpert.LogFunctionExpert(model=accu_parameters, cost_function=cost_model.cost_function)
    elif "direct" in args.expert:
        expert = baseexpert.LookUpExpert(accuracy_value=accu_parameters, cost_function=cost_model.cost_function)
    else:
        raise Exception("We need a defined cost function options [fixed|log|linear]")
        #expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    print "\nExpert: %s " % expert

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200
    eval_range = 1 if (args.budget / evaluation_points) <= 0 else args.budget / evaluation_points
    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          50))

    t0 = time.time()
    ### experiment starts
    for t in range(args.trials):
        print "*" * 60
        print "Trial: %s" % t
        # TODO shuffle the data??
        #student = baselearner.BaseLearner(model=clf, cost_model=cost_model, accuracy_model=accuracy_model, budget=args.budget,
        #                                  seed=t)
        student = randomsampling.RandomSamplingLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t)
        print "\nStudent: %s " % student
        train_indices = []
        train_x = []
        train_y = []
        pool = Bunch()
        pool.data = data.train.bow.tocsr()   # full words, for training
        pool.fixk = data.train.bowk.tocsr()  # k words BOW for querying
        pool.target = data.train.target
        pool.predicted = []
        pool.kwords = np.array(data.train.kwords)  # k words
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        #for x in pool.fixk:
        #    print x.todense().sum()

        bootstrapped = False

        current_cost = 0
        iteration = 0
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:

            if not bootstrapped:
                ## random bootstrap
                #bt = randomsampling.BootstrapRandom(random_state=t * 10)

                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:
                query_index = student.pick_next(pool=pool, k=step_size)

            query = pool.fixk[query_index]  # query with k words

            query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]]

            #if query_size[0] >50:
            #    print "*** %s" % pool.kwords[query_index]

            ground_truth = pool.target[query_index]
            #labels, spent = expert.label(unlabeled=query, target=ground_truth)
            if iteration == 0: ## bootstrap uses ground truth
                labels = ground_truth
            else:
                #labels = expert.label_instances(query, ground_truth)
                labels = expert.label_instances(query_size, ground_truth)
                #spent = expert.estimate_instances(pool.kwords[query_index])
            spent = expert.estimate_instances(query_size)

            query_cost = np.array(spent).sum()
            current_cost += query_cost

            train_indices.extend(query_index)

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # add labels to training
            train_x = pool.data[train_indices]  ## train with all the words

            # update labels with the expert labels
            #train_y = pool.target[train_indices]
            train_y.extend(labels)
            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # retrain the model
            current_model = student.train(train_x, train_y)
            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            #auc = metrics.roc_auc_score(data.test.target, y_probas[:,1])
            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print (
            "TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(len(train_indices), accu,
                                                                                              auc, query_cost,
                                                                                              current_cost, spent))

            ## the results should be based on the cost of the labeling
            if iteration > 0: # bootstrap iteration

                student.budget -= query_cost ## Bootstrap doesn't count

                #x_axis_range = int(current_cost / eval_range)
                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                #accuracies[len(train_indices)].append(accu)
                #aucs[len(train_indices)].append(auc)
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
            iteration += 1
    print("Elapsed time %.3f" % (time() - t0))
    print_results(x_axis, accuracies, aucs)
    def getCrossValidationSplits(self, completeDataset, nSplits):

        datasetUniqueArticlesIds = list(
            set([elem["ArticleId"] for elem in completeDataset.data]))

        articleIdUniqueIntKeyDict = {}
        integerKey = 0

        for articleId in datasetUniqueArticlesIds:
            articleIdUniqueIntKeyDict[articleId] = integerKey
            integerKey = integerKey + 1

        trainingSetGroups = np.asarray([
            articleIdUniqueIntKeyDict[elem["ArticleId"]]
            for elem in completeDataset.data
        ])

        cvStrategyTrainingTestData = GroupKFold(n_splits=nSplits)
        cvFoldsTrainingTestData = cvStrategyTrainingTestData.split(
            completeDataset.data, completeDataset.target, trainingSetGroups)

        foldsPartition = []
        foldsPartitionIndexes = []

        for currentFoldTrainingSetIdx, currentFoldTestIdx in cvFoldsTrainingTestData:

            # get Training/Dev set partitions
            cvStrategyTrainingDevData = GroupKFold(n_splits=nSplits)
            cvFoldsTrainingDevData = cvStrategyTrainingDevData.split(
                completeDataset.data[currentFoldTrainingSetIdx],
                completeDataset.target[currentFoldTrainingSetIdx],
                trainingSetGroups[currentFoldTrainingSetIdx])

            cvFoldsSplits = [
                (trainingSetIdx, devSetIdx)
                for trainingSetIdx, devSetIdx in cvFoldsTrainingDevData
            ]

            currentFoldTrainingSetFinalIdx = currentFoldTrainingSetIdx[
                cvFoldsSplits[0][0]]
            currentFoldDevIdx = currentFoldTrainingSetIdx[cvFoldsSplits[0][1]]

            currentFoldTrainingSet = Bunch()
            currentFoldTrainingSet.data = completeDataset.data[
                currentFoldTrainingSetFinalIdx]
            currentFoldTrainingSet.target = completeDataset.target[
                currentFoldTrainingSetFinalIdx]
            currentFoldTrainingSet.target_names = completeDataset.target_names

            currentFoldDevSet = Bunch()
            currentFoldDevSet.data = completeDataset.data[currentFoldDevIdx]
            currentFoldDevSet.target = completeDataset.target[
                currentFoldDevIdx]
            currentFoldDevSet.target_names = completeDataset.target_names

            currentFoldTestSet = Bunch()
            currentFoldTestSet.data = completeDataset.data[currentFoldTestIdx]
            currentFoldTestSet.target = completeDataset.target[
                currentFoldTestIdx]
            currentFoldTestSet.target_names = completeDataset.target_names

            foldsPartition.append((currentFoldTrainingSet, currentFoldDevSet,
                                   currentFoldTestSet))
            foldsPartitionIndexes.append(
                (currentFoldTrainingSetFinalIdx, currentFoldDevIdx,
                 currentFoldTestIdx))

        # store training, development and test set on pickle file
        cvFoldsPartitionInfoFile = open(
            str(self.corpusPath) + "/" + "FoldsPartition" + "/" +
            str(self.corpusFilename) + "_DatasetPartition_" + str(nSplits) +
            "Folds.pkl",
            mode="wb")  # pickle requires a binary file object

        pickle.dump(foldsPartition, cvFoldsPartitionInfoFile)

        cvFoldsPartitionInfoFile.close()

        # store learning instances indexes for training, development and test set
        cvFoldsPartitionIndexesInfoFile = open(
            str(self.corpusPath) + "/" + "FoldsPartition" + "/" +
            str(self.corpusFilename) + "_DatasetPartition_" + str(nSplits) +
            "FoldsIndexes.pkl",
            mode="wb")  # pickle requires a binary file object

        pickle.dump(foldsPartitionIndexes, cvFoldsPartitionIndexesInfoFile)

        cvFoldsPartitionIndexesInfoFile.close()

        return True
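At its core the method is two nested GroupKFold splits keyed on ArticleId; note that it indexes completeDataset.data and .target with index arrays, so both must be numpy arrays rather than plain lists. The grouping behaviour in isolation, on invented rows:

import numpy as np
from sklearn.model_selection import GroupKFold

data = np.array(["r1", "r2", "r3", "r4", "r5", "r6"])
target = np.array([0, 1, 0, 1, 0, 1])
groups = np.array([0, 0, 1, 1, 2, 2])  # one integer key per ArticleId

for trainIdx, testIdx in GroupKFold(n_splits=3).split(data, target, groups):
    # all rows of an article stay on one side of the split
    print(sorted(set(groups[trainIdx])), sorted(set(groups[testIdx])))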
Example #24
                             stringInput)
    return stringInput


trainingStrings = []
trainingStringCats = []
i = 0

for cat in trainingData:
    for trainingString in trainingData[cat]:
        trainingStrings.append(doRegexReplacement(trainingString))
        trainingStringCats.append(i)
    i += 1

chat = Bunch()
chat.data = trainingStrings
chat.target = trainingStringCats
chat.target_names = list(trainingData.keys())

text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words=['please'])),
    ('tfidf', TfidfTransformer()),
    ('clf',
     SGDClassifier(loss='hinge',
                   penalty='l2',
                   alpha=1e-3,
                   n_iter=5,
                   random_state=42)),
])
_ = text_clf.fit(chat.data, chat.target)
nextFunc = None
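Once fitted, the pipeline classifies raw strings directly; a hypothetical query mapped back to its category name:

predicted = text_clf.predict(["please reset my password"])  # hypothetical input
print(chat.target_names[predicted[0]])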
    def translateDataset(self, sourceLanguage, targetLanguage):

        print "\n Translating training set ..."

        trainingSetTranslations = Bunch()
        trainingSetTranslations.data = []
        trainingSetTranslations.target = []
        trainingSetTranslations.target_names = ["None", "Support"]

        # training set
        for learningInstanceIndex in xrange(len((self.trainingSet).target)):

            sourceADUContent = str(
                tb((self.trainingSet
                    ).data[learningInstanceIndex]["SourceADU"]).translate(
                        from_lang=sourceLanguage, to=targetLanguage))
            targetADUContent = str(
                tb((self.trainingSet
                    ).data[learningInstanceIndex]["TargetADU"]).translate(
                        from_lang=sourceLanguage, to=targetLanguage))

            (trainingSetTranslations.data).append({
                "SourceADU":
                sourceADUContent,
                "TargetADU":
                targetADUContent,
                "SourceADU_tokens":
                self.myTokenizer(sourceADUContent,
                                 lowercase=True,
                                 removePunctuationMarks=False),
                "TargetADU_tokens":
                self.myTokenizer(targetADUContent,
                                 lowercase=True,
                                 removePunctuationMarks=False),
                "ArticleId":
                (self.trainingSet).data[learningInstanceIndex]["ArticleId"]
            })

            (trainingSetTranslations.target).append(
                (self.trainingSet).target[learningInstanceIndex])

        print "\n Translating validation set ..."

        validationSetTranslations = Bunch()
        validationSetTranslations.data = []
        validationSetTranslations.target = []
        validationSetTranslations.target_names = ["None", "Support"]

        # validation set
        for learningInstanceIndex in xrange(len((self.validationSet).target)):

            sourceADUContent = str(
                tb((self.validationSet
                    ).data[learningInstanceIndex]["SourceADU"]).translate(
                        from_lang=sourceLanguage, to=targetLanguage))
            targetADUContent = str(
                tb((self.validationSet
                    ).data[learningInstanceIndex]["TargetADU"]).translate(
                        from_lang=sourceLanguage, to=targetLanguage))

            (validationSetTranslations.data).append({
                "SourceADU":
                sourceADUContent,
                "TargetADU":
                targetADUContent,
                "SourceADU_tokens":
                self.myTokenizer(sourceADUContent,
                                 lowercase=True,
                                 removePunctuationMarks=False),
                "TargetADU_tokens":
                self.myTokenizer(targetADUContent,
                                 lowercase=True,
                                 removePunctuationMarks=False),
                "ArticleId":
                (self.validationSet).data[learningInstanceIndex]["ArticleId"]
            })

            (validationSetTranslations.target).append(
                (self.validationSet).target[learningInstanceIndex])

        print "\n Translating test set ..."

        testSetTranslations = Bunch()
        testSetTranslations.data = []
        testSetTranslations.target = []
        testSetTranslations.target_names = ["None", "Support"]

        # test set
        for learningInstanceIndex in xrange(len((self.testSet).target)):

            sourceADUContent = str(
                tb((self.testSet
                    ).data[learningInstanceIndex]["SourceADU"]).translate(
                        from_lang=sourceLanguage, to=targetLanguage))
            targetADUContent = str(
                tb((self.testSet
                    ).data[learningInstanceIndex]["TargetADU"]).translate(
                        from_lang=sourceLanguage, to=targetLanguage))

            (testSetTranslations.data).append({
                "SourceADU":
                sourceADUContent,
                "TargetADU":
                targetADUContent,
                "SourceADU_tokens":
                self.myTokenizer(sourceADUContent,
                                 lowercase=True,
                                 removePunctuationMarks=False),
                "TargetADU_tokens":
                self.myTokenizer(targetADUContent,
                                 lowercase=True,
                                 removePunctuationMarks=False),
                "ArticleId":
                (self.testSet).data[learningInstanceIndex]["ArticleId"]
            })

            (testSetTranslations.target).append(
                (self.testSet).target[learningInstanceIndex])

        translationsFile = open(self.corpusPath + "/" +
                                self.corpusFilename + "_translatedTo_" +
                                targetLanguage + ".pkl",
                                mode="wb")  # pickle requires a binary file object
        pickle.dump(
            {
                "trainingSet": trainingSetTranslations,
                "validationSet": validationSetTranslations,
                "testSet": testSetTranslations
            }, translationsFile)
        translationsFile.close()

        print "[Done] Translations"
Example #26
def main():
    print args
    print

    accuracies = defaultdict(lambda: [])

    ora_accu = defaultdict(lambda: [])

    oracle_accuracies = []
    ora_cm = defaultdict(lambda: [])
    lbl_dit = defaultdict(lambda: [])

    aucs = defaultdict(lambda: [])

    x_axis = defaultdict(lambda: [])

    vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = 10

    args.fixk = None

    data, vct = load_from_file(args.train, [categories[3]], args.fixk, min_size, vct, raw=True)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = experiment_utils.parse_parameters_mat(args.cost_model)

    print "Cost Parameters %s" % parameters

    cost_model = experiment_utils.set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    ### SENTENCE TRANSFORMATION
    if args.train == "twitter":
        sent_detector = TwitterSentenceTokenizer()
    else:
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## delete <br> to "." to recognize as end of sentence
    data.train.data = experiment_utils.clean_html(data.train.data)
    data.test.data = experiment_utils.clean_html(data.test.data)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
    ## Get the features of the sentence dataset

    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    if not args.fulloracle:
        train_test_data = Bunch()

        expert_data.sentence, train_test_data.pool = split_data(data.train)
        expert_data.oracle, train_test_data.test = split_data(data.test)

        data.train.data = train_test_data.pool.train.data
        data.train.target = train_test_data.pool.train.target

        data.test.data = train_test_data.test.train.data
        data.test.target = train_test_data.test.train.target

    ## convert document to matrix
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")
    exp_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)

    if not args.fulloracle:
        print "Training expert documents:%s" % len(expert_data.oracle.train.data)
        labels, sent_train = experiment_utils.split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=args.limit)

        expert_data.oracle.train.data = sent_train
        expert_data.oracle.train.target = np.array(labels)
        expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)

        exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)
    else:
        # expert_data.data = np.concatenate((data.train.data, data.test.data))
        # expert_data.target = np.concatenate((data.train.target, data.test.target))
        expert_data.data = data.train.data
        expert_data.target = data.train.target
        expert_data.target_names = data.train.target_names
        labels, sent_train = experiment_utils.split_data_sentences(expert_data, sent_detector, vct, limit=args.limit)
        expert_data.bow = vct.transform(sent_train)
        expert_data.target = labels
        expert_data.data = sent_train
        exp_clf.fit(expert_data.bow, expert_data.target)

    if "neutral" in args.expert:
        expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                             cost_function=cost_model.cost_function)
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "pred" in args.expert:
        expert = baseexpert.PredictingExpert(exp_clf,  #threshold=args.neutral_threshold,
                                             cost_function=cost_model.cost_function)
    elif "human" in args.expert:
        expert = baseexpert.HumanExpert(", ".join(["{}={}".format(a,b) for a,b in enumerate(data.train.target_names)])+"? > ")
    else:
        raise Exception("We need an expert!")

    print "\nExpert: %s " % expert

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    sent_clf = None
    if args.cheating:
        labels, sent_train = experiment_utils.split_data_sentences(expert_data.sentence.train, sent_detector, vct, limit=args.limit)

        expert_data.sentence.train.data = sent_train
        expert_data.sentence.train.target = np.array(labels)
        expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)
        sent_clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)
        sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    #### STUDENT CLASSIFIER
    clf = experiment_utils.set_classifier(args.classifier, parameter=args.expert_penalty)

    print "\nStudent Classifier: %s" % clf
    print "\nSentence Classifier: %s" % sent_clf
    print "\nExpert Oracle Classifier: %s" % exp_clf
    print "\nPenalty Oracle:", exp_clf.C
    print "\nVectorizer: %s" % vct
    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          min_size))
    print ("Anytime active learning experiment - use objective function to pick data")
    t0 = time.time()
    tac = []
    tau = []
    ### experiment starts
    for t in range(args.trials):
        trial_accu = []

        trial_aucs = []

        print "*" * 60
        print "Trial: %s" % t

        student = get_student(clf, cost_model, sent_clf, sent_detector, vct)
        student.human_mode = args.expert == 'human'

        print "\nStudent: %s " % student

        train_indices = []
        neutral_data = []  # save the xik vectors
        train_x = []
        train_y = []
        neu_x = []  # data to train the classifier
        neu_y = np.array([])

        pool = Bunch()
        pool.data = data.train.bow.tocsr()  # full words, for training
        pool.text = data.train.data
        pool.target = data.train.target
        pool.predicted = []
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool
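        # pool.remaining holds the indices of still-unlabeled documents;
        # queried indices are removed from it at the end of each iteration.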

        bootstrapped = False
        current_cost = 0
        iteration = 0
        query_index = None
        query_size = None
        oracle_answers = 0
        calibrated = args.calibrate
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
            util = []

            if not bootstrapped:
                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                query = pool.data[query_index]
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:

                chosen = student.pick_next(pool=pool, step_size=step_size)

                query_index = [x for x, y in chosen]  # document id of chosen instances
                query = [y[0] for x, y in chosen]  # sentence of the document

                query_size = [1] * len(query_index)

            ground_truth = pool.target[query_index]

            if iteration == 0:  ## bootstrap uses ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth)  ## bootstrap cost is ignored
            else:
                # print "ask labels"
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            ### accumulate the cost of the query
            query_cost = np.array(spent).sum()
            current_cost += query_cost

            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])

            neutral_answers = np.array([[x, z] for x, y, z in zip(query_index, labels, query_size) if y is None]) \
                if iteration != 0 else np.array([])
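            # A label of None marks a "neutral" expert answer: it is kept out
            # of useful_answers (and thus out of the training labels) and is
            # tracked separately in neutral_answers.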

            ## add recently acquired data to the training set
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])

                # add labels to training
                train_x = pool.data[train_indices]  # # train with all the words

                # update labels with the expert labels
                train_y.extend(useful_answers[:, 1])

            neu_x, neu_y, neutral_data = update_sentence(neutral_data, neu_x, neu_y, labels, query_index, pool, vct)
            # neu_x, neu_y, neutral_data = update_sentence_query(neutral_data, neu_x, neu_y, query, labels)

            if neu_y.shape[0] != neu_x.shape[0]:
                raise Exception("Training data corrupted!")
            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # retrain the model
            current_model = student.train_all(train_x, train_y, neu_x, neu_y)

            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            correct_labels = (np.array(ground_truth) == np.array(labels).reshape(len(labels))).sum()

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print ("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tGT:{5}\tneu:{6}\t{7}\tND:{8}\tTD:{9}\t ora_accu:{10}".format(
                len(train_indices),
                accu,
                auc, query_cost,
                current_cost,
                ground_truth,
                len(neutral_answers), neu_y.shape[0], neu_y.sum(), np.array(train_y).sum(), correct_labels))

            ## the results should be based on the cost of the labeling
            if iteration > 0:  # bootstrap iteration

                student.budget -= query_cost  ## Bootstrap doesn't count
                # oracle accuracy (from queries)
                oracle_answers += correct_labels
                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
                ora_accu[x_axis_range].append(1. * correct_labels)
                ora_cm[x_axis_range].append(metrics.confusion_matrix(ground_truth, labels, labels=np.unique(train_y)))
                lbl_dit[x_axis_range].append(np.sum(train_y))
                # partial trial results
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])
                # oracle_accuracies[x_axis_range].append(oracle_answers)
            iteration += 1
            # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
        oracle_accuracies.append(1. * oracle_answers / (len(train_indices) - bootstrap_size))
        print "Trial: {}, Oracle right answers: {}, Iteration: {}, Labels: {}, ACCU-OR: {}".format(
            t, oracle_answers, iteration, len(train_indices) - bootstrap_size,
            1. * oracle_answers / (len(train_indices) - bootstrap_size))
        #end trial loop
    if args.cost_function != "uniform":
        accuracies = experiment_utils.extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = experiment_utils.extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)
    print "\nAverage oracle accuracy: ", np.array(oracle_accuracies).mean()
    print("Elapsed time %.3f" % (time.time() - t0))
    cheating = "CHEATING" if args.cheating else "NOCHEAT"
    experiment_utils.print_extrapolated_results(accuracies, aucs, file_name=args.train+"-"+cheating+"-"+args.prefix+"-"+args.classifier+"-"+args.student)
    experiment_utils.oracle_accuracy(ora_accu, file_name=args.train+"-"+cheating+"-"+args.prefix+"-"+args.classifier+"-"+args.student, cm=ora_cm, num_trials=args.trials)
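
A minimal, self-contained sketch of the budgeted query loop that Example #26 implements: synthetic data, uncertainty sampling, and a flat per-label cost stand in for the project's experiment_utils, baseexpert, and randomsampling helpers, and the true labels stand in for the expert's answers. This is an illustration of the loop structure only, not the original pipeline.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(0)
X = rng.randn(500, 20)
y = (X[:, 0] + 0.5 * rng.randn(500) > 0).astype(int)
X_pool, y_pool = X[:400], y[:400]
X_test, y_test = X[400:], y[400:]

budget, step, cost_per_label = 50.0, 5, 1.0
remaining = set(range(len(X_pool)))

# bootstrap: a few ground-truth labels from each class, cost-free (cf. BootstrapFromEach)
train_idx = []
for c in (0, 1):
    train_idx.extend(np.where(y_pool == c)[0][:5].tolist())
remaining.difference_update(train_idx)

clf = LogisticRegression()
while budget > 0 and len(remaining) > step:
    clf.fit(X_pool[train_idx], y_pool[train_idx])
    accu = accuracy_score(y_test, clf.predict(X_test))
    # uncertainty sampling: query the instances closest to p = 0.5
    cand = np.array(sorted(remaining))
    p = clf.predict_proba(X_pool[cand])[:, 1]
    queried = [int(i) for i in cand[np.argsort(np.abs(p - 0.5))[:step]]]
    train_idx.extend(queried)        # the stub "expert" answers with true labels
    remaining.difference_update(queried)
    budget -= cost_per_label * len(queried)   # flat cost per label
    print("TS:%d\tAccu:%.3f\tBudget left:%.1f" % (len(train_idx), accu, budget))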
Example #27
"""
Launch a cross-validation on three models to see which is best on custom data from:
    https://archive.ics.uci.edu/ml/datasets.html

Chosen data: https://archive.ics.uci.edu/ml/datasets/Leaf
Tested models: SVC, RandomForestClassifier, DecisionTreeClassifier.

Authors: Claudio Sousa, David Gonzalez
"""

from sklearn import datasets
from cross_validation import cross_validate, plot_validation, output_csv, normalise_data
from models import instanciate_svc_model, instanciate_randomforest_model, instanciate_decisiontree_model
import numpy as np
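# NOTE: newer scikit-learn releases expose Bunch as sklearn.utils.Bunch;
# the sklearn.datasets.base path used below has since been removed.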
from sklearn.datasets.base import Bunch
import pandas as pd

csv = pd.read_csv("../data/leaf.csv")
data = Bunch(data=np.array([list(d[1:]) for d in csv.values]),
             target=np.array([d[0] for d in csv.values]))
data.data = normalise_data(data.data)

models = [
    instanciate_svc_model(),
    instanciate_randomforest_model(1, 11),
    instanciate_decisiontree_model(1, 11)
]

best_model = cross_validate(data, models, 5, 10)
output_csv(models, best_model, "leaf")
plot_validation(models, best_model)
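
For comparison, a rough sketch of the same three-model comparison using scikit-learn's built-in cross-validation; the cross_validation and models modules imported above are custom to the authors, and StandardScaler is assumed here as a stand-in for their normalise_data.

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

csv = pd.read_csv("../data/leaf.csv")          # same layout: label in column 0
X = StandardScaler().fit_transform(csv.values[:, 1:])
y = csv.values[:, 0]

for model in (SVC(), RandomForestClassifier(), DecisionTreeClassifier()):
    scores = cross_val_score(model, X, y, cv=5)
    print("%s: %.3f +/- %.3f" % (model.__class__.__name__, scores.mean(), scores.std()))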
Example #28
def main():
    accuracies = defaultdict(lambda: [])

    aucs = defaultdict(lambda: [])

    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()
    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(100, args.fixk)

    fixk_saved = "{0}{1}.p".format(args.train, args.fixk)

    try:
        fixk_file = open(fixk_saved, "rb")
        data = pickle.load(fixk_file)
    except IOError:
        data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5)
        fixk_file = open(fixk_saved, "wb")
        pickle.dump(data, fixk_file)

    # data = load_dataset(args.train, args.fixk, categories[0], vct, min_size)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = parse_parameters_mat(args.cost_model)

    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__


    #### STUDENT CLASSIFIER
    clf = linear_model.LogisticRegression(penalty="l1", C=1)
    print "\nStudent Classifier: %s" % clf

    #### EXPERT CLASSIFIER

    exp_clf = linear_model.LogisticRegression(penalty='l1', C=.3)
    exp_clf.fit(data.test.bow, data.test.target)
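    # Note: the expert classifier is deliberately fit on the held-out test
    # split; it plays the role of the oracle labeler and is not the model
    # being evaluated.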
    expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                         cost_function=cost_model.cost_function)
    print "\nExpert: %s " % expert

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          min_size))
    print ("Cheating experiment - use full uncertainty query k words")
    t0 = time.time()
    ### experiment starts
    tx = []
    tac = []
    tau = []
    for t in range(args.trials):
        trial_accu = []

        trial_aucs = []

        trial_x_axis = []
        print "*" * 60
        print "Trial: %s" % t

        student = randomsampling.UncertaintyLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t)
        print "\nStudent: %s " % student
        train_indices = []
        train_x = []
        train_y = []
        pool = Bunch()
        pool.data = data.train.bow.tocsr()   # full words, for training
        pool.fixk = data.train.bowk.tocsr()  # k words BOW for querying
        pool.target = data.train.target
        pool.predicted = []
        pool.kwords = np.array(data.train.kwords)  # k words
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        bootstrapped = False

        current_cost = 0
        iteration = 0
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:

            if not bootstrapped:
                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:
                query_index = student.pick_next(pool=pool, k=step_size)

            query = pool.fixk[query_index]  # query with k words

            query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]]

            ground_truth = pool.target[query_index]
            #labels, spent = expert.label(unlabeled=query, target=ground_truth)
            if iteration == 0: ## bootstrap uses ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth) ## bootstrap cost is ignored
            else:
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)


            ## add recently acquired data to the training set
            ## CHANGE: if a label is not useful, ignore it and do not charge for it
            useful_answers = np.array([[x, y, z] for x, y, z in zip(query_index, labels, spent) if y is not None])

            # train_indices.extend(query_index)
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])

                # add labels to training
                train_x = pool.data[train_indices]  ## train with all the words

                # update labels with the expert labels
                train_y.extend(useful_answers[:, 1])

                #count for cost
                ### accumulate the cost of the query
                # query_cost = np.array(spent).sum()
                # current_cost += query_cost
                query_cost = useful_answers[:, 2]
                query_cost = np.sum(query_cost)
                current_cost += query_cost

            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # retrain the model
            current_model = student.train(train_x, train_y)

            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print ("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}".format(len(train_indices),
                                                                                            accu,
                                                                                            auc, query_cost,
                                                                                            current_cost, spent))

            ## the results should be based on the cost of the labeling
            if iteration > 0:   # bootstrap iteration

                student.budget -= query_cost ## Bootstrap doesn't count

                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)

                ## partial trial results

                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])
            iteration += 1

        # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
    #end trial loop

    accuracies = extrapolate_trials(tac)
    aucs = extrapolate_trials(tau)

    print("Elapsed time %.3f" % (time.time() - t0))
    print_extrapolated_results(accuracies, aucs)
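
extrapolate_trials and print_extrapolated_results are project helpers not shown here; as a rough sketch under that caveat, averaging the defaultdict accumulators into a (cost, mean accuracy) learning curve could look like the following, with hypothetical values standing in for real trial results.

import numpy as np
from collections import defaultdict

accuracies = defaultdict(list)
accuracies[10.0].extend([0.71, 0.69])   # hypothetical results from two trials
accuracies[20.0].extend([0.78, 0.80])

for cost in sorted(accuracies):
    vals = accuracies[cost]
    print("cost=%.1f\tmean accu=%.3f\tn=%d" % (cost, np.mean(vals), len(vals)))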
Example #29
def main():
    accuracies = defaultdict(lambda: [])

    aucs = defaultdict(lambda: [])

    x_axis = defaultdict(lambda: [])

    vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_tokenizer()

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    min_size = max(100, args.fixk)

    if args.fixk < 0:
        args.fixk = None

    fixk_saved = "{0}{1}.p".format(args.train, args.fixk)

    try:
        print "Loading existing file... %s " % args.train
        fixk_file = open(fixk_saved, "rb")
        data = pickle.load(fixk_file)
        fixk_file.close()
        vectorizer = open("{0}vectorizer.p".format(args.train), "rb")
        vct = pickle.load(vectorizer)
        vectorizer.close()
    except (IOError, ValueError):
        print "Loading from scratch..."
        data = load_dataset(args.train, args.fixk, categories[0], vct, min_size, percent=.5)
        fixk_file = open(fixk_saved, "wb")
        pickle.dump(data, fixk_file)
        fixk_file.close()
        vectorizer = open("{0}vectorizer.p".format(args.train), "wb")
        pickle.dump(vct, vectorizer)
        vectorizer.close()

    # data = load_dataset(args.train, args.fixk, categories[0], vct, min_size)

    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = parse_parameters_mat(args.cost_model)

    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    #### STUDENT CLASSIFIER
    clf = linear_model.LogisticRegression(penalty="l1", C=1)
    # clf = set_classifier(args.classifier)
    print "\nStudent Classifier: %s" % clf

    #### EXPERT CLASSIFIER

    exp_clf = linear_model.LogisticRegression(penalty='l1', C=args.expert_penalty)
    exp_clf.fit(data.test.bow, data.test.target)
    expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                         cost_function=cost_model.cost_function)
    print "\nExpert: %s " % expert

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          min_size))
    print ("Anytime active learning experiment - use objective function to pick data")
    t0 = time.time()
    tac = []
    tau = []
    ### experiment starts
    for t in range(args.trials):
        trial_accu = []

        trial_aucs = []

        print "*" * 60
        print "Trial: %s" % t
        if args.student == "anyunc":
            student = randomsampling.AnytimeLearner(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
                                                    subpool=250, cost_model=cost_model)
        elif args.student == "lambda":
            student = randomsampling.AnytimeLearnerDiff(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
                                                    subpool=250, cost_model=cost_model, lambda_value=args.lambda_value)
        elif args.student == "anyzero":
            student = randomsampling.AnytimeLearnerZeroUtility(model=clf, accuracy_model=None, budget=args.budget, seed=t, vcn=vct,
                                                    subpool=250, cost_model=cost_model)
        else:
            raise ValueError("Oops! We do not know that anytime strategy. Try again.")

        print "\nStudent: %s " % student
        train_indices = []
        neutral_text = []  # save the raw text of the queries
        neutral_data = []  # save the xik vectors
        train_x = []
        train_y = []
        neu_x = [] # data to train the classifier
        neu_y = np.array([])

        pool = Bunch()
        pool.data = data.train.bow.tocsr()   # full words, for training
        pool.text = data.train.data
        # pool.fixk = data.train.bowk.tocsr()  # k words BOW for querying
        pool.target = data.train.target
        pool.predicted = []
        # pool.kwords = np.array(data.train.kwords)  # k words
        pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

        bootstrapped = False

        current_cost = 0
        iteration = 0
        query_index = None
        query_size = None
        while 0 < student.budget and len(pool.remaining) > step_size and iteration <= args.maxiter:
            util = []
            if not bootstrapped:
                ## random from each bootstrap
                bt = randomsampling.BootstrapFromEach(t * 10)

                query_index = bt.bootstrap(pool=pool, k=bootstrap_size)
                bootstrapped = True
                query = pool.data[query_index]
                print "Bootstrap: %s " % bt.__class__.__name__
                print
            else:
                # print "pick instance"

                ## query_chosen holds (index, k) pairs;
                ## util holds (utility, k, uncertainty) tuples
                query_chosen, util = student.pick_next(pool=pool, step_size=step_size)
                query_index = [a for a, b in query_chosen]
                query_size = [b for a, b in query_chosen]

                # query = pool.fixk[query_index]  # query with k words
                qk = []
                for q, k in query_chosen:
                    qk.append(" ".join(vct_analizer(pool.text[q])[0:int(k)]))
                query = vct.transform(qk)

            # query_size = [len(vct_analizer(x)) for x in pool.kwords[query_index]]

            ground_truth = pool.target[query_index]
            #labels, spent = expert.label(unlabeled=query, target=ground_truth)
            if iteration == 0: ## bootstrap uses ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth) ## bootstrap cost is ignored
            else:
                # print "ask labels"
                labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            ### accumulate the cost of the query
            query_cost = np.array(spent).sum()
            current_cost += query_cost
            # print query_index
            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])
            neutral_answers = np.array([[x, z] for x, y, z in zip(query_index, labels, query_size) if y is None]) \
                if iteration != 0 else np.array([])

            # print labels
            # print "label\tutility\tk\tunc"
            # print format_query(zip(labels, util))

            ## add data recent acquired to train
            if useful_answers.shape[0] != 0:
                # print "get training"
                # train_indices.extend(query_index)
                train_indices.extend(useful_answers[:, 0])

                # add labels to training
                train_x = pool.data[train_indices]  # # train with all the words

                # update labels with the expert labels
                #train_y = pool.target[train_indices]
                train_y.extend(useful_answers[:, 1])

            if neutral_answers.shape[0] != 0:
                # current query neutrals
                qlbl = []

                for xik, lbl in zip(query, labels):
                    # neutral_data.append(xik)
                    if isinstance(neutral_data, list):
                        neutral_data = xik
                    else:
                        neutral_data = vstack([neutral_data, xik], format='csr')
                    qlbl.append(neutral_label(lbl))

                ## append the labels of the current query
                neu_y = np.append(neu_y, qlbl)
                neu_x = neutral_data
                # end neutral_answers block


            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove labels from pool
            pool.remaining.difference_update(query_index)

            # retrain the model
            # current_model = student.train(train_x, train_y)
            # print "train models"
            current_model = student.train_all(train_x, train_y, neu_x, neu_y)
            # print "evaluate"
            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            accu = metrics.accuracy_score(data.test.target, pred_y)

            print ("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tSpent:{5}\tneu:{6}\t{7}".format(
                len(train_indices),
                accu,
                auc, query_cost,
                current_cost,
                format_spent(spent),
                len(neutral_answers), neu_y.shape[0]))

            ## the results should be based on the cost of the labeling
            if iteration > 0:   # bootstrap iteration

                student.budget -= query_cost ## Bootstrap doesn't count

                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
                # partial trial results
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])

            iteration += 1
            # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
        #end trial loop
    if args.cost_function != "uniform":
        accuracies = extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)

    print("Elapsed time %.3f" % (time.time() - t0))
    print_extrapolated_results(accuracies, aucs)
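
A closing note on the neutral_data accumulation in Example #29: it grows a sparse matrix one CSR row at a time with scipy.sparse.vstack, starting from a non-matrix sentinel (hence the isinstance(neutral_data, list) check above). A minimal sketch of that pattern:

import numpy as np
from scipy.sparse import csr_matrix, vstack

rows = [csr_matrix(np.array([[1, 0, 2]])), csr_matrix(np.array([[0, 3, 0]]))]
acc = None                      # non-matrix sentinel, mirroring the list check above
for r in rows:
    acc = r if acc is None else vstack([acc, r], format='csr')
print(acc.shape)                # (2, 3)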