def prepare_features(self):
    self.pretrain_train_features = utils.generate_features(
        self.QR_weak_train_path, 'QR_weak', self.QR_weak_train_slice)
    self.pretrain_dev_features = utils.generate_features(
        self.QR_weak_dev_path, 'QR_weak', self.QR_weak_dev_slice)
    self.finetune_train_features = utils.generate_features(
        self.QR_train_path, 'QR')
    self.finetune_devs_features = [
        utils.generate_features(self.QR_dev_dir + str(i) + '.csv', 'QR')
        for i in range(7)
    ]
def index_images(folder, features_path, mapping_path, model, glove_path):
    print("Now indexing images...")
    word_vectors = utils.load_glove_vectors(glove_path)
    _, _, paths = utils.load_paired_img_wrd(folder=folder,
                                            word_vectors=word_vectors)
    images_features, file_index = utils.generate_features(paths, model)
    utils.save_features(features_path, images_features, mapping_path,
                        file_index)
    return images_features, file_index
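# A minimal usage sketch for index_images, assuming a Keras-style CNN as the
# feature extractor. The VGG16 import and every path below are illustrative
# placeholders, not part of the original module.
from tensorflow.keras.applications import VGG16

model = VGG16(weights='imagenet', include_top=False, pooling='avg')
features, file_index = index_images(folder='data/images',
                                    features_path='index/features.npy',
                                    mapping_path='index/index.json',
                                    model=model,
                                    glove_path='models/glove.6B.300d.txt')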
def fit(self, X, y, cv=3):
    """
    Fits wrapper.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The training input samples.
    y : array-like, shape (n_samples,)
        The target values.
    cv : int
        Number of folds in cross-validation.

    Returns
    -------
    None
    """
    # Rank every feature with the wrapped measure and sort ascending by score.
    features_ranks = dict(zip(generate_features(X), self.__measure(X, y)))
    sorted_features_ranks = OrderedDict(
        sorted(features_ranks.items(), key=lambda x: x[1]))
    selected_features = np.array(
        [feature for feature in sorted_features_ranks])
    number_of_features_left_to_remove = self.__n_features__
    self.__estimator__.fit(X[:, selected_features], y)
    accuracy = get_current_cv_accuracy(self.__estimator__, X, y,
                                       selected_features, cv)
    i = 0
    self.best_score = accuracy
    # Greedy backward elimination: drop the feature at position i only if
    # cross-validated accuracy improves; otherwise move on to the next one.
    while i < len(selected_features):
        iteration_features = np.delete(selected_features, i)
        self.__estimator__.fit(X[:, iteration_features], y)
        iteration_accuracy = get_current_cv_accuracy(
            self.__estimator__, X, y, iteration_features, cv)
        if iteration_accuracy > self.best_score:
            selected_features = iteration_features
            number_of_features_left_to_remove -= 1
            self.best_score = iteration_accuracy
            if not number_of_features_left_to_remove:
                break
        else:
            i += 1
    self.features__ = selected_features
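# A hedged sketch of how this backward-elimination wrapper might be driven,
# assuming a scikit-learn estimator; BackwardSelectionWrapper is a
# hypothetical name for the class the fit method above belongs to.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
selector = BackwardSelectionWrapper(LogisticRegression(max_iter=1000),
                                    n_features=5)
selector.fit(X, y, cv=3)
print(selector.best_score, selector.features__)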
def fit_criterion_measure(X, y):
    x = np.asarray(X)  # Converting input data to a numpy array
    y = np.asarray(y).reshape(-1)  # Convert first so list inputs also work
    fc = np.zeros(
        x.shape[1])  # Amounts of correct predictions for each feature
    tokens_n = np.max(y) + 1  # Number of different class tokens
    # Each of the arrays below is recalculated separately for each feature
    centers = np.empty(
        tokens_n)  # Centers of the sets of feature values per class token
    variances = np.empty(
        tokens_n)  # Variances of the sets of feature values per class token
    # Distances between a sample's value and each class's center; this array
    # is recalculated for each feature and each sample
    distances = np.empty(tokens_n)
    for feature_index, feature in enumerate(x.T):  # For each feature
        # Initializing utility structures
        class_values = [[] for _ in range(tokens_n)
                        ]  # Lists of feature values for each class token
        for index, value in enumerate(y):  # Filling the array
            class_values[value].append(feature[index])
        for token, values in enumerate(
                class_values):  # For each class token's list of values
            tmp_arr = np.array(values)
            centers[token] = np.mean(tmp_arr)
            variances[token] = np.var(tmp_arr)
        # Main calculations
        for sample_index, value in enumerate(feature):  # For each sample
            for i in range(tokens_n):  # For each class token
                # 0/0 division can raise warnings here; in that case the
                # default results are still interpreted correctly
                distances[i] = np.abs(value - centers[i]) / variances[i]
            fc[feature_index] += np.argmin(distances) == y[sample_index]
    fc /= y.shape[0]
    return dict(zip(generate_features(x), fc))
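# A small worked example for fit_criterion_measure on a toy dataset, assuming
# generate_features from the same module maps plain arrays to integer feature
# indices. Each score is the fraction of samples whose nearest
# (variance-scaled) class center along that feature matches the true label.
import numpy as np

X = np.array([[1.0, 10.0],
              [1.2, -3.0],
              [5.0, 11.0],
              [5.1, -2.5]])
y = np.array([0, 0, 1, 1])
print(fit_criterion_measure(X, y))  # feature 0 separates the classes better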
def process_fov(fov_image,
                segmentation_output,
                output_dir,
                objects_per_patch=50,
                shape="bitmap"):
    """
    Generates the supervisely directory for the given fov.

    Arguments:
        fov_image: ndarray
            2D Numpy array of the gray scale image
        segmentation_output: ndarray 2D uint16
            The instance segmentation output. One int id per object.
        output_dir: foldername
            The location of the output supervisely folder
        objects_per_patch: int
            Number of objects per supervisely ROI
        shape: string ["bitmap", "polygon"]
            The shape of nucleus in the supervisely project.
    """
    scaled = generate_bmp(fov_image)
    project_dir = output_dir
    masks = segmentation_output
    if os.path.isdir(project_dir):
        shutil.rmtree(project_dir)
    os.mkdir(project_dir)
    ann_dir = os.path.join(project_dir, "ann")
    image_dir = os.path.join(project_dir, "img")
    os.mkdir(image_dir)
    os.mkdir(ann_dir)
    # Create the template for the project directory
    meta_file = os.path.join(project_dir, "..", "meta.json")
    meta_json = generate_project_template(shape)
    with open(meta_file, 'w') as outfile:
        json.dump(meta_json, outfile)
    edges, center_blobs, areas, borders, contours = generate_features(masks)
    x_dim = masks.shape[0]
    y_dim = masks.shape[1]
    grid_size, id_list = calcuate_grid_size(masks, objects_per_patch)
    # Sort the contours by centroid. Every ROI is decomposed into 5 vertical
    # stripes, then the objects are sorted by their horizontal index.
    contours = sorted(
        contours,
        key=lambda k: [int(k[2][0] / (grid_size / 5)), -1 * k[2][1]])
    processed_ids = set([])
    x_index = 0
    for x_start in range(0, x_dim, grid_size):
        y_index = 0
        for y_start in range(0, y_dim, grid_size):
            lower_bound = x_start
            upper_bound = min(x_start + grid_size, x_dim)
            left_bound = y_start
            right_bound = min(y_start + grid_size, y_dim)
            roi_ids = list(
                np.unique(masks[lower_bound:upper_bound,
                                left_bound:right_bound]))
            if 0 in roi_ids:
                roi_ids.remove(0)
            roi_ids_set = set(roi_ids)
            roi_to_be_processed = list(roi_ids_set.difference(processed_ids))
            roi_already_processed = list(roi_ids_set & processed_ids)
            processed_ids.update(roi_to_be_processed)
            print("{0}:{1}, {2}:{3} to process {4}, already_processed: {5}".
                  format(lower_bound, upper_bound, left_bound, right_bound,
                         len(roi_to_be_processed),
                         len(roi_already_processed)))
            if shape == "polygon":
                contours_to_process = [
                    cont[0] for cont in contours
                    if cont[1] in roi_to_be_processed
                ]
                contours_already_processed = [
                    cont[0] for cont in contours
                    if cont[1] in roi_already_processed
                ]
                object_func = create_polygon
            elif shape == "bitmap":
                contours_to_process = [(cont[3], cont[4])
                                       for cont in contours
                                       if cont[1] in roi_to_be_processed]
                contours_already_processed = [
                    (cont[3], cont[4]) for cont in contours
                    if cont[1] in roi_already_processed
                ]
                object_func = create_bitmap
            bb = (left_bound, upper_bound, right_bound, lower_bound)
            annotations = create_ann(
                masks.shape,
                contours_to_process,
                object_func,
                bb=bb,
                already_processed=contours_already_processed)
            result_prefix = "{0}_{1}_{2}_{3}_{4}_{5}".format(
                x_index, y_index, lower_bound, upper_bound, left_bound,
                right_bound)
            ann_file_name = result_prefix + ".json"
            ann_file = os.path.join(ann_dir, ann_file_name)
            with open(ann_file, 'w') as outfile:
                json.dump(annotations, outfile)
            image_file_name = result_prefix + ".bmp"
            image_file = os.path.join(image_dir, image_file_name)
            imsave(image_file, scaled)
            y_index += 1
        x_index += 1
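# A hedged usage sketch for process_fov; the tifffile reads and the file
# names below are placeholders, assumed to yield a 2D grayscale FOV and a
# matching uint16 instance mask where 0 is background.
from tifffile import imread

fov = imread("fov.tif")        # 2D grayscale image
labels = imread("labels.tif")  # uint16 instance mask, one id per object
process_fov(fov, labels, "supervisely_project/ds",
            objects_per_patch=50, shape="bitmap")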
def load_data_features():
    """
    Loads the data; if the data has not been split yet, it will be split
    into a train and a val set.
    """
    RANDOM_STATE = 123
    train_file = Path("data/train.csv")
    if train_file.exists():
        train = pd.read_csv("data/train.csv")
        val = pd.read_csv("data/val.csv")
        test = pd.read_csv("data/test.csv")
        features_train = pd.read_csv("data/features_train.csv")
        features_val = pd.read_csv("data/features_val.csv")
        features_test = pd.read_csv("data/features_test.csv")
    else:
        # Split normal data
        train_cola = pd.read_csv("data/SemEval/olid-training-v1.0.tsv",
                                 delimiter="\t")
        test_cola = pd.read_csv("data/SemEval/testset-levela.tsv",
                                delimiter="\t")
        labels_cola = pd.read_csv("data/SemEval/labels-levela.csv",
                                  header=None)
        labels_cola.columns = ['id', 'subtask_a']
        test = pd.merge(test_cola, labels_cola, on='id')

        # Remove duplicates
        train_cola = train_cola.drop_duplicates("tweet")
        test = test.drop_duplicates("tweet")

        train, val = train_test_split(train_cola,
                                      test_size=0.2,
                                      random_state=RANDOM_STATE)
        train = train.reset_index(drop=True)  # reset_index returns a copy
        val = val.reset_index(drop=True)

        train = train[["tweet", "subtask_a"]]
        val = val[["tweet", "subtask_a"]]
        test = test[["tweet", "subtask_a"]]
        train.columns = ['text', 'label']
        val.columns = ['text', 'label']
        test.columns = ['text', 'label']

        # Generate features
        features_train = generate_features(train)
        features_val = generate_features(val)
        features_test = generate_features(test)

        train.to_csv("data/train.csv", index=False)
        val.to_csv("data/val.csv", index=False)
        test.to_csv("data/test.csv", index=False)
        features_train.to_csv("data/features_train.csv")
        features_val.to_csv("data/features_val.csv")
        features_test.to_csv("data/features_test.csv")
    return train, val, test, features_train, features_val, features_test
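# Typical call site for the loader above; the column names follow the
# renaming done inside the function.
train, val, test, f_train, f_val, f_test = load_data_features()
print(train.columns.tolist())  # ['text', 'label']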
def load_data(subtask, use_features=False):
    """
    Loads the data; if the data has not been split yet, it will be split
    into a train and a val set.

    Args:
        subtask = a, b, c
    """
    subtask_name = "subtask_" + subtask
    RANDOM_STATE = 123
    train_file = Path("data/train_" + subtask + ".csv")
    if train_file.exists():
        train = pd.read_csv("data/train_" + subtask + ".csv")
        val = pd.read_csv("data/val_" + subtask + ".csv")
        test = pd.read_csv("data/test_" + subtask + ".csv")
        if use_features:
            features_train = pd.read_csv("data/features_train_" + subtask +
                                         ".csv")
            features_val = pd.read_csv("data/features_val_" + subtask +
                                       ".csv")
            features_test = pd.read_csv("data/features_test_" + subtask +
                                        ".csv")
    else:
        # Split normal data
        train_cola = pd.read_csv("data/SemEval/olid-training-v1.0.tsv",
                                 delimiter="\t")
        test_cola = pd.read_csv("data/SemEval/testset-level" + subtask +
                                ".tsv",
                                delimiter="\t")
        labels_cola = pd.read_csv("data/SemEval/labels-level" + subtask +
                                  ".csv",
                                  header=None)
        labels_cola.columns = ['id', subtask_name]
        test = pd.merge(test_cola, labels_cola, on='id')

        # Remove duplicates
        train_cola = train_cola.drop_duplicates("tweet")
        test = test.drop_duplicates("tweet")

        # Remove nan in the subtask column
        train_cola = train_cola.dropna(subset=[subtask_name])
        test = test.dropna(subset=[subtask_name])

        train, val = train_test_split(train_cola,
                                      test_size=0.2,
                                      random_state=RANDOM_STATE)
        train = train.reset_index(drop=True)  # reset_index returns a copy
        val = val.reset_index(drop=True)

        train = train[["tweet", subtask_name]]
        val = val[["tweet", subtask_name]]
        test = test[["tweet", subtask_name]]
        train.columns = ['text', 'label']
        val.columns = ['text', 'label']
        test.columns = ['text', 'label']

        train.to_csv("data/train_" + subtask + ".csv", index=False)
        val.to_csv("data/val_" + subtask + ".csv", index=False)
        test.to_csv("data/test_" + subtask + ".csv", index=False)

        if use_features:
            # Generate features
            features_train = generate_features(train)
            features_val = generate_features(val)
            features_test = generate_features(test)
            features_train.to_csv("data/features_train_" + subtask + ".csv",
                                  index=False)
            features_val.to_csv("data/features_val_" + subtask + ".csv",
                                index=False)
            features_test.to_csv("data/features_test_" + subtask + ".csv",
                                 index=False)

    if use_features:
        # Return the feature frames in both the cached and the freshly
        # generated case, so the arity of the return value depends only
        # on use_features.
        return (train, val, test, features_train, features_val,
                features_test)
    return train, val, test
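# Companion call for the subtask-aware loader; with use_features=True the
# return value grows to six frames, as noted in the function body.
train, val, test, f_train, f_val, f_test = load_data("a", use_features=True)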
def run(cfg):
    generate_features(globals(), cfg.overwrite)
def run(self, X, y, feature_names=None):
    """
    X: shape (n_samples, n_features)
        Feature matrix, an array filled with features.
    y: shape (n_samples, 1)
        Label matrix, used to calculate the SU value for each feature.
    :return: shape (n_samples, self.__n_features)
        An array filled with the selected features.
    """
    # Calculate the entropy of y.
    entropy = self.cal_entropy(y)
    feature_names = generate_features(X, feature_names)
    # Calculate the conditional entropy for each feature.
    self.feature_scores = dict()
    for index in range(len(X.T)):
        # Group sample indices by the value this feature takes.
        dict_i = dict()
        for i in range(len(X.T[index])):
            if X.T[index][i] not in dict_i:
                dict_i.update({X.T[index][i]: [i]})
            else:
                dict_i[X.T[index][i]].append(i)
        # Conditional entropy of a feature.
        con_entropy = 0.0
        # Entropy of the feature itself.
        entropy_x = self.cal_entropy(X[:, index])
        # Get the corresponding values in y.
        for f in dict_i.values():
            # Probability of each value of the feature.
            p = len(f) / len(X.T[0])
            # Counts of the corresponding labels.
            dict_y = dict()
            for i in f:
                if y.T[i] not in dict_y:
                    dict_y.update({y.T[i]: 1})
                else:
                    dict_y[y.T[i]] += 1
            # Entropy of the labels conditioned on this feature value.
            sub_entropy = 0.0
            for l in dict_y.values():
                sub_entropy += -l / sum(dict_y.values()) * math.log(
                    l / sum(dict_y.values()), 2)
            con_entropy += sub_entropy * p
        # Symmetric uncertainty: SU = 2 * IG(X; y) / (H(X) + H(y)).
        self.feature_scores[feature_names[index]] = 2 * (
            entropy - con_entropy) / (entropy_x + entropy)
    # Sort by symmetric uncertainty in descending order.
    new_list = list(
        sorted(self.feature_scores.items(), reverse=True,
               key=lambda k: k[1]))
    # Drop every feature outside the top __n_features in a single call so
    # earlier deletions do not shift column indices; item[0] is the feature
    # index, item[1] its SU score.
    drop = [item[0] for item in new_list[self.__n_features:]]
    X = np.delete(X, drop, axis=1)
    return X
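# A hedged usage sketch for the SU-based selector above; SymmetricUncertainty
# is a hypothetical name for the enclosing class, with cal_entropy and
# __n_features assumed to be defined by it.
import numpy as np

X = np.array([[0, 1, 0],
              [0, 1, 1],
              [1, 0, 0],
              [1, 0, 1]])
y = np.array([0, 0, 1, 1])
selector = SymmetricUncertainty(n_features=1)
X_reduced = selector.run(X, y)  # keeps the feature with the highest SU
print(X_reduced.shape)          # (4, 1)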
def run(cfg):
    # If overwrite is False, existing feature files are not overwritten.
    # The feature classes defined in this file are picked out of globals()
    # and each one is executed.
    print("Start!")
    generate_features(globals(), cfg.base.overwrite)
    print("success!")
def run(cfg):
    # If overwrite is False, existing feature files are not overwritten.
    # The feature classes defined in this file are picked out of globals()
    # and each one is executed.
    generate_features(globals(), cfg.base.overwrite)
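# A sketch of the kind of feature class generate_features(globals(), ...) is
# expected to discover; this assumes the common convention of a Feature base
# class whose subclasses implement create_features, which is not shown in
# the snippets above, and the column names are hypothetical.
class FamilySize(Feature):
    def create_features(self):
        # Derive a family-size feature from the raw train/test dataframes
        # loaded by the surrounding script.
        self.train['family_size'] = train['SibSp'] + train['Parch'] + 1
        self.test['family_size'] = test['SibSp'] + test['Parch'] + 1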