def q_2_4():
    print("******RUNNING TITANIC DATA SET*****")

    data, test_data, feature_names, class_names = load_titanic_data()
    data = preprocess_titanic(data, True)

    # Shuffle deterministically, then hold out the last rows for validation.
    perm = np.random.RandomState(seed=20).permutation(data.shape[0])
    data = data[perm]
    data, valid = data[:800], data[800:]
    idy = data.shape[1] - 1  # index of the label column

    type_map, categories_map = gen_maps(data)
    classifier = DecisionTree(type_map, categories_map)
    classifier.fit(data, 4, 10)
    train_predictions = classifier.predict(data)
    train_actual = extract_column(data, idy)
    valid_predictions = classifier.predict(valid)
    valid_actual = extract_column(valid, idy)

    print("Decision Tree training Accuracies:       ",
          error_rate(train_predictions, train_actual))
    print("Decision Tree Validation Accuracies:    ",
          error_rate(valid_predictions, valid_actual))

    classifier = RandomForest(300, 300, 2, type_map, categories_map, 20)
    classifier.fit(data, 10, 10)
    train_predictions = classifier.predict(data)
    train_actual = extract_column(data, idy)
    valid_predictions = classifier.predict(valid)
    valid_actual = extract_column(valid, idy)

    print("Random Forest training Accuracies:       ",
          error_rate(train_predictions, train_actual))
    print("Random Forest Validation Accuracies:    ",
          error_rate(valid_predictions, valid_actual))
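
Every snippet on this page calls an extract_column helper that the page never shows, and the name actually covers several unrelated implementations: the decision-tree snippets use a NumPy version, the NLP snippets pass a file path plus a column header, and the docker snippets use a text-based utils version (sketched further below). Here is a minimal sketch of the NumPy version, inferred from how it is called (results are recombined with np.hstack, so it must return 2-D column vectors), together with an assumed error_rate helper; both are guesses from usage, not the source's code.

import numpy as np

# Hypothetical helpers, inferred from usage; the originals are not shown.
def extract_column(data, idx):
    # Return column `idx` of a 2-D array as an (n, 1) column vector,
    # so results can be recombined with np.hstack.
    return data[:, idx].reshape(-1, 1)

def error_rate(predictions, actual):
    # Fraction of mismatched labels.
    return float(np.mean(predictions.flatten() != actual.flatten()))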
Example #2
    def gini_purification(data, idx, categorical, thresholds):
        """
        Input: data (last column is the label column), index of the feature to split on,
            whether the split is categorical, and a list of thresholds or categories.
        Output: the reduction in Gini impurity (impurity before the split minus the
            weighted impurity after the split); larger is better.
        """
        idy = data.shape[1] - 1
        y = extract_column(data, idy)
        x = extract_column(data, idx)
        gini_before = DecisionTree.gini_impurity(y)
        gini_after = 0

        # joined has two columns: the idx feature on the left, the label on the right
        joined = np.hstack((x, y))
        total = len(joined)

        if categorical:
            for category in thresholds:
                split_data = joined[joined[:, 0] == category]
                new_y = extract_column(split_data, 1)
                gini_after += DecisionTree.gini_impurity(new_y) * len(new_y)
        else:
            # Assumes thresholds are sorted ascending: peel off the rows below each
            # threshold in turn; the final bucket is whatever remains in joined.
            for thresh in thresholds:
                split_data_below = joined[joined[:, 0] < thresh]
                joined = joined[joined[:, 0] >= thresh]
                new_y = extract_column(split_data_below, 1)
                gini_after += DecisionTree.gini_impurity(new_y) * len(new_y)
            new_y = extract_column(joined, 1)
            gini_after += DecisionTree.gini_impurity(new_y) * len(new_y)
        gini_after /= total  # weighted average of the child impurities

        return gini_before - gini_after
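
gini_purification relies on DecisionTree.gini_impurity, which is also not shown on this page. A minimal sketch under the standard definition, one minus the sum of squared class proportions; this is an assumption about the helper, not the source's implementation.

import numpy as np

# Assumed implementation of the gini_impurity helper used above.
def gini_impurity(y):
    # Gini impurity of a label column: 1 - sum of squared class proportions.
    if len(y) == 0:
        return 0.0
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - float(np.sum(p ** 2))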
Example #3
def image_delete_and_containers(image):
    """ WARNING: This will remove an image and all its dependant containers
    """
    for container in utils.extract_column(utils.filter_column(utils.Command('docker ps').stdout, 1, eq=image), -1):
        container_stop(container)
    for container in utils.extract_column(utils.filter_column(utils.Command('docker ps -a').stdout, 1, eq=image), -1):
        container_delete(container)
    return image_delete(image)
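
The docker snippets on this page use text-based utils.extract_column and utils.filter_column helpers that slice whitespace-delimited fields out of command output. Their real implementations are not shown; the sketches below are inferred from the call sites (a start argument skips header lines, sep overrides the whitespace delimiter, and eq / startswith select matching rows).

# Hypothetical sketches of the utils helpers, inferred from how they are called.
def extract_column(lines, column, start=0, sep=None):
    # Field `column` of each line (split on whitespace, or on `sep` if given),
    # skipping the first `start` lines.
    return [line.split(sep)[column] for line in lines[start:] if line.strip()]

def filter_column(lines, column, start=0, eq=None, startswith=None):
    # Keep the lines whose field `column` equals `eq` or starts with `startswith`.
    out = []
    for line in lines[start:]:
        fields = line.split()
        value = fields[column] if column < len(fields) else ''
        if eq is not None and value != eq:
            continue
        if startswith is not None and not value.startswith(startswith):
            continue
        out.append(line)
    return out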
    def extract_embeddings_as_features(self, inputfile, embedding_model='word2vec-google-news-300'):
        '''
        Extract features (word-embedding vectors) for the tokens in the input file.

        :param inputfile: path to the input file
        :param embedding_model: name of a pretrained word embedding model
        :type inputfile: string
        :type embedding_model: string

        :return features: list of vector representations of the tokens
        '''
        ### This code was partially inspired by code included in the HLT course, obtained from https://github.com/cltl/ma-hlt-labs/, accessed in May 2020.
        features = []
        embedding_model = api.load(embedding_model)  # api is presumably gensim.downloader
        tokens = extract_column(inputfile, 'token')
        
        for token in tokens:
            if token in embedding_model:
                vector = embedding_model[token]
            else:
                # OOV fallback: a zero vector (300 is the default model's dimensionality)
                vector = [0] * 300
            features.append(vector)

        return features
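
One caveat on the snippet above: the [0]*300 fallback is tied to the dimensionality of the default word2vec-google-news-300 model. If a different embedding_model name is passed, a dimension-agnostic fallback is safer; gensim KeyedVectors expose their dimensionality as vector_size.

            # Sketch of a dimension-agnostic OOV fallback (assumes a gensim KeyedVectors model):
            vector = embedding_model[token] if token in embedding_model \
                else [0.0] * embedding_model.vector_size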
def wait_running_process(cmd, container, timeout=1):
    """ Poll the container's process list until `cmd` appears, or time out.
    """
    count, step = timeout, 0.2
    while count > 0:
        if cmd in utils.extract_column(docker_exec('ps -A', container, user='******'), -1, 1):
            return True
        time.sleep(step)
        count -= step
    return False
Example #6
def run_evaluations(data_infofile):
    '''
    Carry out standard evaluation for one or more system outputs.

    :param data_infofile: path to a json file describing the gold standard and the system output files
    :type data_infofile: string

    :returns: the evaluations for all systems
    '''
    evaluations = {}
    data = load_json(data_infofile)
    gold_annotations = extract_column(data['gold']['file'], data['gold']['annotation_column'])
    for key, value in data.items():
        if key != 'training' and key != 'gold':
            system_annotations = extract_column(value['file'], value['annotation_column'])
            sys_evaluation = carry_out_evaluation(gold_annotations, system_annotations)
            evaluations[key] = sys_evaluation
    return evaluations
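
For reference, here is the structure of the JSON info file implied by the lookups above, written out as a Python dict; every key, path, and column name in it is illustrative, not taken from the source.

# Illustrative structure only; actual paths and column names will differ.
data_info = {
    "gold": {"file": "data/gold.tsv", "annotation_column": "gold_label"},
    "training": {"file": "data/train.tsv", "annotation_column": "label"},
    "system1": {"file": "out/system1.tsv", "annotation_column": "prediction"},
}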
def get_networks(filter=None, driver=None):
    docker_cmd = 'docker network ls'
    if driver:
        networks = utils.extract_column(utils.filter_column(utils.Command(docker_cmd).stdout, 2, 1, eq=driver), 1)
    else:
        networks = utils.Command(docker_cmd).stdout_column(1, 1)
    if filter:
        if isinstance(filter, str):
            return [x for x in networks if filter in x]   # substring match
        else:
            return [x for x in networks if x in filter]   # membership in a collection
    return networks
def create_user(user, container, groups=(), home=None, shell=None):
    """ Create a user with optional groups, home and shell
    """
    cmd = 'useradd {}{}{}'.\
        format(user,
               ' -d {}'.format(home) if home else '',
               ' -s {}'.format(shell) if shell else '')
    docker_exec(cmd, container)
    existing_groups = utils.extract_column(docker_exec('cat /etc/group', container), 0, sep=':')
    for group in groups:
        if group not in existing_groups:
            docker_exec('addgroup {}'.format(group), container)
        docker_exec('usermod -a -G {} {}'.format(group, user), container)
def kaggle():
    data, test_data, feature_names, class_names = load_titanic_data()
    data = preprocess_titanic(data, True)
    test = preprocess_titanic(test_data, False)

    type_map, categories_map = gen_maps(data)
    classifier = DecisionTree(type_map, categories_map)

    classifier.fit(data, 4, 10)
    predictions = classifier.predict(test)
    pred_train = classifier.predict(data)
    actual = extract_column(data, 9)  # label column after preprocessing
    print("Training error rate:", error_rate(pred_train, actual))
    results_to_csv(predictions.flatten())
    """
Example #10
def gen_maps(data):
    type_map = {}
    categories_map = {}

    # Columns 0, 1, 7, 8: passenger class, sex, cabin deck, port of embarkation.
    for i in [0, 1, 7, 8]:
        type_map[i] = 'categorical'
    # Columns 2, 3, 4, 6: quantitative features.
    for i in [2, 3, 4, 6]:
        type_map[i] = 'quantitative'
    type_map[5] = 'clustered'  # ticket number (see preprocess_titanic)

    categories_map[0] = [1, 2, 3]
    categories_map[1] = ['male', 'female']
    categories_map[7] = list(set(extract_column(data, 7).flatten()))
    categories_map[8] = ['S', 'C', 'Q']
    return type_map, categories_map
    def get_labels(self, inputfile, annotation_column):
        """
        This function extracts all the labels from the inputfile in the form required by the model

        :param inputfile: path to inputfile
        :param annotation_column: header name of the column with annotations
        :type inputfile: string
        :type annotation_column: string
        
        :returns: labels in required form
        """

        # extract the labels in the form required for CRF and return
        if self.modelname == 'crf':
            return self.extract_crf_labels(inputfile, annotation_column)

        # extract and return the labels in the form required for all other models
        return extract_column(inputfile, annotation_column)
def get_containers(filter=None, image=None, all=True):
    """ Get containers names, with optional filter on name.
    :param filter: if string, get containers names containing it, if python container (list, set, ...),
           get containers in this set.
    :param image: if string, get containers from this image (ignore filter).
    :param all: if False, get only running containers, else get all containers.
    :return: a list of containers names
    """
    docker_cmd = 'docker ps -a' if all else 'docker ps'
    if image:
        return utils.extract_column(utils.filter_column(utils.Command(docker_cmd).stdout, 1, eq=image), -1)
    else:
        containers = utils.Command(docker_cmd).stdout_column(-1, 1)
        if filter:
            if isinstance(filter, str):
                return [x for x in containers if filter in x]   # substring match
            else:
                return [x for x in containers if x in filter]   # membership in a collection
        return containers
Example #13
def preprocess_titanic(data, include_labels):
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

    columns = [0 for _ in range(9)]  # placeholders for the 9 feature columns

    # Column 5 (ticket): strip non-digit characters, convert to float, impute with the mean.
    current_col = extract_column(data, 5).flatten()
    current_col = [re.sub('[^0-9]', '', ticket) for ticket in current_col]
    current_col = [float(ticket) if ticket != '' else float(np.nan) for ticket in current_col]
    current_col = np.array([current_col]).T
    current_col = imp_mean.fit_transform(current_col)
    columns[5] = current_col

    for col in [0, 1, 8]:
        current_col = extract_column(data, col)
        current_col = imp_mode.fit_transform(current_col)
        columns[col] = current_col
    for col in [2, 3, 4, 6]:
        current_col = extract_column(data, col)
        current_col = imp_mean.fit_transform(current_col)
        columns[col] = current_col

    # Column 7 (cabin): keep the deck letter; impute missing decks from passenger class.
    current_col = extract_column(data, 7).flatten()
    current_col = [re.sub('[^A-Za-z]', '', cabin)[0] if isinstance(cabin, str) else cabin for cabin in current_col]
    class_col = extract_column(data, 0).flatten()
    current_col = ['B' if not isinstance(current_col[i], str) and class_col[i] == 1 else current_col[i] for i in
                   range(len(current_col))]
    current_col = ['D' if not isinstance(current_col[i], str) and class_col[i] == 2 else current_col[i] for i in
                   range(len(current_col))]
    current_col = ['F' if not isinstance(current_col[i], str) and class_col[i] == 3 else current_col[i] for i in
                   range(len(current_col))]
    current_col = np.array([current_col]).T
    columns[7] = current_col

    if include_labels:
        columns.append(extract_column(data, 9))
    return np.hstack(tuple(columns))
Example #14
def gen_quantitative_thresholds(data, idx):
    # A single binary split at the column mean.
    xs = extract_column(data, idx)
    return [np.mean(xs)]
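
The mean gives a single binary split per quantitative feature. A plausible variant (an assumption, not in the source) that produces finer splits while keeping the ascending order that gini_purification's non-categorical branch relies on is to split at quantiles.

import numpy as np

# Hypothetical variant: split at quartiles instead of the single mean above.
def gen_quantitative_thresholds_quantiles(data, idx, qs=(0.25, 0.5, 0.75)):
    xs = extract_column(data, idx)
    return sorted(float(np.quantile(xs, q)) for q in qs)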
def get_processes(container, filter=None):
    processes = utils.extract_column(docker_exec('ps -A', container, user='******'), -1, 1)
    if filter is None:
        return processes
    return [proc for proc in processes if filter in proc]
def get_version(app, container):
    output = docker_exec('apt-cache policy {}'.format(app), container, user='******')
    try:
        return utils.extract_column(utils.filter_column(output, 0, startswith='Install'), 1, sep=':')[0]
    except IndexError:
        return None  # package not installed or not found