Example 1
import pickle
import sys

import autosklearn.classification
import numpy

import performance_metrics  # project-local module providing get_perform_metrics


def conduct_auto_sklearn(experimentdatafile, resamplingstrategy,
                         resamplingstrategyarguments, timefortask):

    # Load the pickled experiment data
    with open(experimentdatafile, "rb") as datafile:
        lunchbox = pickle.load(datafile)

    # Set up auto-sklearn and run it against the Pythia feature sets
    clf = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=timefortask, per_run_time_limit=360,
        initial_configurations_via_metalearning=25, ensemble_size=50,
        ensemble_nbest=50, seed=1, ml_memory_limit=3000,
        include_estimators=None, include_preprocessors=None,
        resampling_strategy=resamplingstrategy,
        resampling_strategy_arguments=resamplingstrategyarguments,
        tmp_folder=None, output_folder=None,
        delete_tmp_folder_after_terminate=True,
        delete_output_folder_after_terminate=True, shared_mode=False)

    # TODO Find a way to suppress the voluminous INFO messages from auto-sklearn
    clf.fit(numpy.asarray(lunchbox['train_data']),
            numpy.asarray(lunchbox['train_target']))

    # Print the auto-sklearn model ensemble
    print("Models", clf.show_models(), file=sys.stderr)

    # Get performance metrics of the auto-sklearn models against the test data
    predictions = clf.predict(numpy.asarray(lunchbox['test_data']))
    performresults = performance_metrics.get_perform_metrics(
        numpy.asarray(lunchbox['test_target']), predictions)

    # Fill the results dictionary to return to Sacred for logging
    results = dict()
    results['autosklearn_models'] = clf.show_models()
    results['autosklearn_perform_results'] = performresults
    results['directory'] = lunchbox['directory']
    results['features'] = lunchbox['features']
    results['algorithms'] = lunchbox['algorithms']
    results['parameters'] = lunchbox['parameters']

    return results
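As a usage sketch only: the file name, resampling strategy, split ratio, and time budget below are hypothetical values, not taken from the original project; the holdout arguments follow auto-sklearn's documented resampling options.

# Hypothetical call; 'experiment.pkl', the 67/33 holdout split, and the
# one-hour budget are illustrative assumptions
results = conduct_auto_sklearn(
    experimentdatafile='experiment.pkl',
    resamplingstrategy='holdout',
    resamplingstrategyarguments={'train_size': 0.67},
    timefortask=3600)
print(results['autosklearn_perform_results'])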
Example 2
from sklearn import linear_model

import performance_metrics  # project-local module providing get_perform_metrics


def run_model(train_data, train_labels, test_data, test_labels):
    '''
    Trains a logistic regression model on bag-of-words features built from a
    set of training text, then uses it to predict the labels of a second set.
    Method modified from code available at:
    https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words
    Args:
        train_data: Training feature set.  Needs to be iterable
        train_labels: Training set labels
        test_data: Feature set to predict labels for
        test_labels: Test set labels used to score the predictions
    Returns:
        pred_labels: The predicted labels as determined by logistic regression
        perform_results: Dictionary of performance metrics for the predictions
    '''

    # Use logistic regression to train a model
    logreg = linear_model.LogisticRegression(C=1e5)

    # Fit the logistic regression model to the training data
    logreg.fit(train_data, train_labels)

    # Now that we have a trained model, check its accuracy against the test set
    pred_labels = logreg.predict(test_data)
    perform_results = performance_metrics.get_perform_metrics(test_labels, pred_labels)

    # perform_results is a dictionary, so add other pertinent run information to it
    perform_results['vector'] = 'Bag_of_Words'
    perform_results['alg'] = 'Logistic_Regression'

    return pred_labels, perform_results
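Since run_model expects already-vectorized text, a caller has to build the bag-of-words features first. A minimal sketch, assuming scikit-learn's CountVectorizer and a toy corpus invented for illustration:

from sklearn.feature_extraction.text import CountVectorizer

# Toy corpora for illustration only
train_text = ["the cat sat on the mat", "dogs bark loudly", "cats purr softly"]
train_labels = [0, 1, 0]
test_text = ["the dog sat quietly"]
test_labels = [1]

# Build the bag-of-words features, fitting the vocabulary on the training text
vectorizer = CountVectorizer(max_features=5000)
train_data = vectorizer.fit_transform(train_text)
test_data = vectorizer.transform(test_text)

pred_labels, perform_results = run_model(train_data, train_labels,
                                         test_data, test_labels)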
Example 3
import h5py

import performance_metrics  # project-local module providing get_perform_metrics


def predicter(classifier, test_data, test_labels):
    # Handle the HDF5 case: a string is treated as the path to an HDF5 file
    # holding both the test data and labels, so both arguments must match
    if isinstance(test_data, str):
        assert test_data == test_labels
        with h5py.File(test_data, "r") as f:
            test_data = f['data'][()]
            test_labels = f['labels'][()]

    # Predict labels for the test set and score the predictions
    pred_labels = classifier.predict(test_data)
    perform_results = performance_metrics.get_perform_metrics(test_labels, pred_labels)
    return pred_labels, perform_results
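The string branch assumes both arguments name the same HDF5 file, with the test features stored under 'data' and the labels under 'labels'. A sketch of producing such a file and calling the function with it; the file name, synthetic data, and stand-in classifier are hypothetical:

import h5py
import numpy as np
from sklearn.linear_model import LogisticRegression

# Fit a stand-in classifier on synthetic data
X = np.random.rand(50, 4)
y = np.random.randint(0, 2, size=50)
classifier = LogisticRegression().fit(X, y)

# Write the test set in the layout predicter expects: one file,
# datasets named 'data' and 'labels'
with h5py.File('test_set.h5', 'w') as f:
    f.create_dataset('data', data=np.random.rand(10, 4))
    f.create_dataset('labels', data=np.random.randint(0, 2, size=10))

pred_labels, perform_results = predicter(classifier, 'test_set.h5', 'test_set.h5')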
Example 4
import sys
import time

import numpy as np
from sklearn import metrics

import performance_metrics  # project-local module providing get_perform_metrics


def do_epoch(dmn, mode, epoch, batch_size, log_every, skipped=0):
    '''
    Runs one epoch of training or testing for the neural network. It calls
    the steps in the epoch and allows for the tweaking of the parameter values.

    Args:
        dmn: dynamic memory network object that provides the batches and steps
        mode (str): 'train' or 'test' for whether this is the training or testing step
        epoch (int): the epoch number currently on
        batch_size (int): number of examples per step
        log_every (int): number of steps between progress log messages
        skipped (int): how many steps have been skipped because of no change in gradient
    Returns:
        avg_loss (double): the new calculated average loss
        skipped (int): steps skipped while running this epoch added to the previous skip value
        perform_results (dict): performance metrics for the epoch's predictions
        y_pred (list): the predicted labels
    '''
    y_true = []
    y_pred = []
    avg_loss = 0.0
    prev_time = time.time()

    batches_per_epoch = dmn.get_batches_per_epoch(mode)
    print(batches_per_epoch, dmn)

    for i in range(0, batches_per_epoch):
        # Run one step using the dynamic memory network object
        step_data = dmn.step(i, mode)
        prediction = step_data["prediction"]
        answers = step_data["answers"]
        current_loss = step_data["current_loss"]
        current_skip = step_data.get("skipped", 0)
        log = step_data["log"]

        skipped += current_skip

        if current_skip == 0:
            avg_loss += current_loss

            for x in answers:
                y_true.append(x)

            for x in prediction.argmax(axis=1):
                # Some predictions are not 0/1 for the first couple of guesses
                # TODO figure out why...but until then this catches the issue
                if x not in [0, 1]:
                    x = np.random.randint(0, 2)
                y_pred.append(x)

            # TODO: save the state sometimes
            if i % log_every == 0:
                cur_time = time.time()
                print("  %sing: %d.%d / %d \t loss: %.3f \t avg_loss: %.3f \t skipped: %d \t %s \t time: %.2fs" %
                      (mode, epoch, i * batch_size, batches_per_epoch * batch_size,
                       current_loss, avg_loss / (i + 1), skipped, log, cur_time - prev_time))
                prev_time = cur_time

        if np.isnan(current_loss):
            print("==> current loss IS NaN. This should never happen :)")
            sys.exit(1)

    avg_loss /= batches_per_epoch
    print("\n  %s loss = %.5f" % (mode, avg_loss))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_true, y_pred))

    perform_results = performance_metrics.get_perform_metrics(y_true, y_pred)
    print(perform_results)

    # Accuracy over the full epoch; skipped steps still count in the denominator
    accuracy = sum([1 if t == p else 0 for t, p in zip(y_true, y_pred)])
    print("accuracy: %.2f percent" % (accuracy * 100.0 / batches_per_epoch / batch_size))

    return avg_loss, skipped, perform_results, y_pred
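A sketch of the driver loop that might sit above this function; the dmn object's construction is project-specific and assumed here, and the epoch count, batch size, and logging interval are illustrative:

# Hypothetical driver; 'dmn' is assumed to be an already-built dynamic
# memory network exposing get_batches_per_epoch(mode) and step(i, mode)
skipped = 0
for epoch in range(10):
    train_loss, skipped, train_results, _ = do_epoch(
        dmn, 'train', epoch, batch_size=32, log_every=100, skipped=skipped)
    test_loss, skipped, test_results, _ = do_epoch(
        dmn, 'test', epoch, batch_size=32, log_every=100, skipped=skipped)
    print("epoch %d: train loss %.5f, test loss %.5f"
          % (epoch, train_loss, test_loss))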
Example 5
import performance_metrics  # project-local module providing get_perform_metrics


def predicter(classifier, test_data, test_labels):
    # Predict labels for the test set and score the predictions
    pred_labels = classifier.predict(test_data)
    perform_results = performance_metrics.get_perform_metrics(
        test_labels, pred_labels)
    return pred_labels, perform_results