def q_2_4():
    print("******RUNNING TITANIC DATA SET*****")
    data, test_data, feature_names, class_names = load_titanic_data()
    data = preprocess_titanic(data, True)
    # Shuffle with a fixed seed, then hold out everything past row 800 for validation.
    perm = np.random.RandomState(seed=20).permutation(data.shape[0])
    data = data[perm]
    data, valid = data[:800], data[800:]
    idy = data.shape[1] - 1
    type_map, categories_map = gen_maps(data)
    classifier = DecisionTree(type_map, categories_map)
    classifier.fit(data, 4, 10)
    train_predictions = classifier.predict(data)
    train_actual = extract_column(data, idy)
    valid_predictions = classifier.predict(valid)
    valid_actual = extract_column(valid, idy)
    print("Decision Tree training error rate: ", error_rate(train_predictions, train_actual))
    print("Decision Tree validation error rate: ", error_rate(valid_predictions, valid_actual))
    classifier = RandomForest(300, 300, 2, type_map, categories_map, 20)
    classifier.fit(data, 10, 10)
    train_predictions = classifier.predict(data)
    train_actual = extract_column(data, idy)
    valid_predictions = classifier.predict(valid)
    valid_actual = extract_column(valid, idy)
    print("Random Forest training error rate: ", error_rate(train_predictions, train_actual))
    print("Random Forest validation error rate: ", error_rate(valid_predictions, valid_actual))
def gini_purification(data, idx, categorical, thresholds):
    """
    Input: data (last column is the label column), index of the feature to split on,
    whether or not the split is categorical, and a list of thresholds or categories.
    Output: the reduction in Gini impurity achieved by the split; larger is better.
    """
    idy = data.shape[1] - 1
    y = extract_column(data, idy)
    x = extract_column(data, idx)
    gini_before = DecisionTree.gini_impurity(y)
    gini_after = 0
    # joined has two columns: the idx feature on the left, the label on the right.
    joined = np.hstack((x, y))
    total = len(joined)
    if categorical:
        for category in thresholds:
            split_data = joined[joined[:, 0] == category]
            new_y = extract_column(split_data, 1)
            gini_after += DecisionTree.gini_impurity(new_y) * len(new_y)
    else:
        # Peel off the rows below each threshold in turn, then score the remainder.
        for thresh in thresholds:
            split_data_below = joined[joined[:, 0] < thresh]
            joined = joined[joined[:, 0] >= thresh]
            new_y = extract_column(split_data_below, 1)
            gini_after += DecisionTree.gini_impurity(new_y) * len(new_y)
        new_y = extract_column(joined, 1)
        gini_after += DecisionTree.gini_impurity(new_y) * len(new_y)
    gini_after /= total
    return gini_before - gini_after
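# Hedged usage sketch (not part of the original module): exercises
# gini_purification on a toy array whose column 0 is quantitative and whose
# last column is a binary label. The values and threshold are illustrative,
# and the sketch assumes extract_column returns a 2-D column slice.
def _demo_gini_purification():
    toy = np.array([[1.0, 0], [2.0, 0], [8.0, 1], [9.0, 1]])
    # A split at 5.0 separates the labels perfectly, so the reduction should
    # equal the parent impurity (0.5 for a balanced binary label).
    return gini_purification(toy, 0, categorical=False, thresholds=[5.0])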
def image_delete_and_containers(image):
    """ WARNING: This will remove an image and all its dependent containers """
    # Stop every running container built from the image...
    for container in utils.extract_column(utils.filter_column(utils.Command('docker ps').stdout, 1, eq=image), -1):
        container_stop(container)
    # ...then delete every container (running or stopped) built from it.
    for container in utils.extract_column(utils.filter_column(utils.Command('docker ps -a').stdout, 1, eq=image), -1):
        container_delete(container)
    return image_delete(image)
def extract_embeddings_as_features(self, inputfile, embedding_model='word2vec-google-news-300'):
    '''
    Function that extracts features using word embeddings

    :param inputfile: path to inputfile
    :param embedding_model: name of a pretrained word embedding model
    :type inputfile: string
    :type embedding_model: string

    :return features: list of vector representations of tokens
    '''
    ### This code was partially inspired by code included in the HLT course,
    ### obtained from https://github.com/cltl/ma-hlt-labs/, accessed in May 2020.
    features = []
    model = api.load(embedding_model)
    tokens = extract_column(inputfile, 'token')
    for token in tokens:
        if token in model:
            vector = model[token]
        else:
            # Out-of-vocabulary tokens fall back to a zero vector.
            vector = [0] * 300
        features.append(vector)
    return features
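# Hedged sketch (not part of the original module) of the lookup pattern above,
# assuming gensim's downloader is imported as `import gensim.downloader as api`
# (which the api.load call implies). Model name and token are illustrative.
# Note the fallback above hard-codes the 300 dimensions of
# word2vec-google-news-300; model.vector_size would generalise it.
def _demo_embedding_lookup():
    model = api.load('glove-wiki-gigaword-50')  # small model for a quick check
    token = 'house'
    vector = model[token] if token in model else [0] * model.vector_size
    return vector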
def wait_running_process(cmd, container, timeout=1):
    """ Poll the container's process list until cmd shows up or the timeout expires """
    count, step = timeout, 0.2
    while count > 0:
        if cmd in utils.extract_column(docker_exec('ps -A', container, user='******'), -1, 1):
            return True
        time.sleep(step)
        count -= step
    # The process never appeared within the timeout.
    return False
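# Hedged usage sketch (not part of the original module): wait up to five
# seconds for an nginx process to appear. Process and container names are
# illustrative.
def _demo_wait_running_process():
    if wait_running_process('nginx', 'web-1', timeout=5):
        print('nginx is up')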
def run_evaluations(data_infofile):
    '''
    Carry out standard evaluation for one or more system outputs

    :param data_infofile: filepath to json providing information about the system and gold files
    :type data_infofile: string

    :returns: the evaluations for all systems
    '''
    evaluations = {}
    data = load_json(data_infofile)
    gold_annotations = extract_column(data['gold']['file'], data['gold']['annotation_column'])
    for key, value in data.items():
        # Every entry other than 'training' and 'gold' is a system output.
        if key != 'training' and key != 'gold':
            system_annotations = extract_column(value['file'], value['annotation_column'])
            sys_evaluation = carry_out_evaluation(gold_annotations, system_annotations)
            evaluations[key] = sys_evaluation
    return evaluations
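# Hedged sketch (not part of the original module) of the data_infofile layout
# that run_evaluations appears to expect: a 'gold' entry plus one entry per
# system, each naming a file and its annotation column. Keys and paths are
# illustrative.
_EXAMPLE_DATA_INFO = {
    'gold': {'file': 'data/gold.conll', 'annotation_column': 'gold_label'},
    'system_crf': {'file': 'out/crf.conll', 'annotation_column': 'prediction'},
}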
def get_networks(filter=None, driver=None):
    """ Get docker network names, optionally restricted by driver and/or name filter """
    docker_cmd = 'docker network ls'
    if driver:
        networks = utils.extract_column(utils.filter_column(utils.Command(docker_cmd).stdout, 2, 1, eq=driver), 1)
    else:
        networks = utils.Command(docker_cmd).stdout_column(1, 1)
    if filter:
        if isinstance(filter, basestring):
            # String filter: keep networks whose name contains the substring.
            return [x for x in networks if filter in x]
        else:
            # Collection filter: keep networks that are members of the collection.
            return [x for x in networks if x in filter]
    return networks
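# Hedged usage sketch (not part of the original module); driver and name
# values are illustrative.
def _demo_get_networks():
    bridges = get_networks(driver='bridge')   # all bridge networks
    mine = get_networks(filter='myapp')       # names containing 'myapp'
    return bridges, mine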
def create_user(user, container, groups=(), home=None, shell=None):
    """ Create a user with optional groups, home and shell """
    cmd = 'useradd {}{}{}'.\
        format(user,
               ' -d {}'.format(home) if home else '',
               ' -s {}'.format(shell) if shell else '')
    docker_exec(cmd, container)
    # Only create groups that do not already exist in the container,
    # then add the user to each requested group.
    existing_groups = utils.extract_column(docker_exec('cat /etc/group', container), 0, sep=':')
    for group in groups:
        if group not in existing_groups:
            docker_exec('addgroup {}'.format(group), container)
        docker_exec('usermod -a -G {} {}'.format(group, user), container)
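# Hedged usage sketch (not part of the original module); the container, user,
# and group names are illustrative.
def _demo_create_user():
    create_user('deploy', 'web-1', groups=('docker', 'sudo'),
                home='/home/deploy', shell='/bin/bash')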
def kaggle():
    data, test_data, feature_names, class_names = load_titanic_data()
    data = preprocess_titanic(data, True)
    test = preprocess_titanic(test_data, False)
    type_map, categories_map = gen_maps(data)
    classifier = DecisionTree(type_map, categories_map)
    classifier.fit(data, 4, 10)
    predictions = classifier.predict(test)
    pred_train = classifier.predict(data)
    actual = extract_column(data, 9)
    print(error_rate(pred_train, actual))
    results_to_csv(predictions.flatten())
def gen_maps(data):
    type_map = {}
    categories_map = {}
    for i in [0, 1, 7, 8]:
        type_map[i] = 'categorical'
    for i in [2, 3, 4, 6]:
        type_map[i] = 'quantitative'
    type_map[5] = 'clustered'
    categories_map[0] = [1, 2, 3]
    categories_map[1] = ['male', 'female']
    categories_map[7] = list(set(extract_column(data, 7).flatten()))
    categories_map[8] = ['S', 'C', 'Q']
    return type_map, categories_map
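# Hedged usage sketch (not part of the original module): gen_maps runs on the
# preprocessed feature matrix and its output feeds the DecisionTree and
# RandomForest constructors above.
def _demo_gen_maps():
    data = preprocess_titanic(load_titanic_data()[0], True)
    type_map, categories_map = gen_maps(data)
    assert type_map[5] == 'clustered' and categories_map[8] == ['S', 'C', 'Q']
    return type_map, categories_map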
def get_labels(self, inputfile, annotation_column):
    """
    This function extracts all the labels from the inputfile in the form required by the model

    :param inputfile: path to inputfile
    :param annotation_column: indication of column with annotations
    :type inputfile: string
    :type annotation_column: string

    :returns: labels in the required form
    """
    # extract the labels in the form required for CRF and return
    if self.modelname == 'crf':
        return self.extract_crf_labels(inputfile, annotation_column)
    # extract and return the labels in the form required for all other models
    return extract_column(inputfile, annotation_column)
def get_containers(filter=None, image=None, all=True):
    """ Get container names, with optional filter on name.

    :param filter: if string, get containers whose names contain it;
                   if a python container (list, set, ...), get containers in this set.
    :param image: if string, get containers from this image (ignores filter).
    :param all: if False, get only running containers, else get all containers.
    :return: a list of container names
    """
    docker_cmd = 'docker ps -a' if all else 'docker ps'
    if image:
        return utils.extract_column(utils.filter_column(utils.Command(docker_cmd).stdout, 1, eq=image), -1)
    else:
        containers = utils.Command(docker_cmd).stdout_column(-1, 1)
    if filter:
        if isinstance(filter, basestring):
            return [x for x in containers if filter in x]
        else:
            return [x for x in containers if x in filter]
    return containers
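# Hedged usage sketch (not part of the original module); the image name is
# illustrative.
def _demo_get_containers():
    running = get_containers(all=False)          # running containers only
    from_nginx = get_containers(image='nginx')   # containers built from nginx
    return running, from_nginx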
def preprocess_titanic(data, include_labels):
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    # Placeholder list for the nine processed feature columns.
    columns = [0 for _ in range(9)]
    # Column 5 (ticket): strip letters, convert to float, impute with the mean.
    current_col = extract_column(data, 5).flatten()
    current_col = [re.sub('[^0-9]', '', ticket) for ticket in current_col]  # remove all letters
    current_col = [float(ticket) if ticket != '' else np.nan for ticket in current_col]  # convert to float
    current_col = np.array([current_col]).T
    current_col = imp_mean.fit_transform(current_col)
    columns[5] = current_col
    # Categorical columns: impute missing values with the mode.
    for col in [0, 1, 8]:
        current_col = extract_column(data, col)
        current_col = imp_mode.fit_transform(current_col)
        columns[col] = current_col
    # Quantitative columns: impute missing values with the mean.
    for col in [2, 3, 4, 6]:
        current_col = extract_column(data, col)
        current_col = imp_mean.fit_transform(current_col)
        columns[col] = current_col
    # Column 7 (cabin): keep the leading deck letter; fill missing decks by passenger class.
    current_col = extract_column(data, 7).flatten()
    current_col = [re.sub('[^A-Za-z]', '', cabin)[0] if isinstance(cabin, str) else cabin for cabin in current_col]
    class_col = extract_column(data, 0).flatten()
    current_col = ['B' if not isinstance(current_col[i], str) and class_col[i] == 1 else current_col[i] for i in range(len(current_col))]
    current_col = ['D' if not isinstance(current_col[i], str) and class_col[i] == 2 else current_col[i] for i in range(len(current_col))]
    current_col = ['F' if not isinstance(current_col[i], str) and class_col[i] == 3 else current_col[i] for i in range(len(current_col))]
    current_col = np.array([current_col]).T
    columns[7] = current_col
    if include_labels:
        columns.append(extract_column(data, 9))
    return np.hstack(tuple(columns))
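# Hedged usage sketch (not part of the original module): the preprocessing is
# fit separately on the train and test sets here, mirroring how kaggle()
# calls it above.
def _demo_preprocess_titanic():
    train_raw, test_raw, feature_names, class_names = load_titanic_data()
    train = preprocess_titanic(train_raw, True)   # labels appended as last column
    test = preprocess_titanic(test_raw, False)    # no label column
    return train, test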
def gen_quantitative_thresholds(data, idx):
    xs = extract_column(data, idx)
    return [np.mean(xs)]
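# Hedged sketch (not part of the original module): the single candidate
# threshold is just the column mean. The toy column is illustrative and the
# sketch assumes extract_column returns the column's values.
def _demo_gen_quantitative_thresholds():
    toy = np.array([[1.0], [2.0], [3.0], [10.0]])
    assert gen_quantitative_thresholds(toy, 0) == [4.0]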
def get_processes(container, filter=None):
    processes = utils.extract_column(docker_exec('ps -A', container, user='******'), -1, 1)
    if filter is None:
        return processes
    return [proc for proc in processes if filter in proc]
def get_version(app, container):
    output = docker_exec('apt-cache policy {}'.format(app), container, user='******')
    try:
        return utils.extract_column(utils.filter_column(output, 0, startswith='Install'), 1, sep=':')[0]
    except IndexError:
        # No matching 'Installed' line in the output; return None implicitly.
        pass
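# Hedged usage sketch (not part of the original module); the package and
# container names are illustrative.
def _demo_get_version():
    version = get_version('curl', 'web-1')
    print(version or 'curl not found')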