Example #1
    def prepare_features(self):
        self.pretrain_train_features = utils.generate_features(
            self.QR_weak_train_path, 'QR_weak', self.QR_weak_train_slice)
        self.pretrain_dev_features = utils.generate_features(
            self.QR_weak_dev_path, 'QR_weak', self.QR_weak_dev_slice)
        self.finetune_train_features = utils.generate_features(
            self.QR_train_path, 'QR')
        self.finetune_devs_features = [
            utils.generate_features(self.QR_dev_dir + str(i) + '.csv', 'QR')
            for i in range(7)
        ]
Example #2
def index_images(folder, features_path, mapping_path, model, glove_path):
    print("Now indexing images...")
    word_vectors = utils.load_glove_vectors(glove_path)
    _, _, paths = utils.load_paired_img_wrd(folder=folder,
                                            word_vectors=word_vectors)
    images_features, file_index = utils.generate_features(paths, model)
    utils.save_features(features_path, images_features, mapping_path,
                        file_index)
    return images_features, file_index
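A hypothetical usage sketch for the function above. The paths are placeholders, and the choice of a headless Keras VGG16 as the image model is an assumption; the snippet itself does not say what kind of model utils.generate_features expects.

from tensorflow.keras.applications import VGG16

# Placeholder paths; a VGG16 without its classification head stands in for the
# image feature extractor (an assumption, not something the snippet specifies).
model = VGG16(weights="imagenet", include_top=False, pooling="avg")
features, file_index = index_images(folder="data/images",
                                    features_path="index/images_features.npy",
                                    mapping_path="index/file_index.json",
                                    model=model,
                                    glove_path="models/glove.6B.300d.txt")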
Example #3
    def fit(self, X, y, cv=3):
        """
        Fits the wrapper.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The training input samples.
        y : array-like, shape (n_samples,)
            The target values.
        cv : int
            Number of folds used in cross-validation.

        Returns
        -------
        None
        """
        features_ranks = dict(zip(generate_features(X), self.__measure(X, y)))
        sorted_features_ranks = OrderedDict(
            sorted(features_ranks.items(), key=lambda x: x[1]))
        selected_features = np.array(
            [feature for feature in sorted_features_ranks])
        number_of_features_left_to_remove = self.__n_features__

        self.__estimator__.fit(X[:, selected_features], y)
        accuracy = get_current_cv_accuracy(self.__estimator__, X, y,
                                           selected_features, cv)
        i = 0
        self.best_score = accuracy
        # Greedy backward elimination: try dropping one feature at a time and
        # keep the removal only if the cross-validated accuracy improves.
        while len(sorted_features_ranks) != i and i < len(selected_features):
            iteration_features = np.delete(selected_features, i)
            self.__estimator__.fit(X[:, iteration_features], y)

            iteration_accuracy = get_current_cv_accuracy(
                self.__estimator__, X, y, iteration_features, cv)
            if iteration_accuracy > self.best_score:
                selected_features = iteration_features
                number_of_features_left_to_remove -= 1
                self.best_score = iteration_accuracy
                if not number_of_features_left_to_remove:
                    break
            else:
                i += 1

        self.features__ = selected_features
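For context, here is a small self-contained sketch of the same greedy backward-elimination idea on synthetic data, using scikit-learn's cross_val_score in place of the wrapper's get_current_cv_accuracy; the class internals above are not reproduced, so this illustrates the technique rather than the project's API.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
selected = np.arange(X.shape[1])
estimator = LogisticRegression(max_iter=1000)
best = cross_val_score(estimator, X[:, selected], y, cv=3).mean()

i = 0
while i < len(selected):
    trial = np.delete(selected, i)  # try dropping one candidate feature
    score = cross_val_score(estimator, X[:, trial], y, cv=3).mean()
    if score > best:                # keep the removal only if CV accuracy improves
        selected, best = trial, score
    else:
        i += 1

print("kept features:", selected, "cv accuracy:", round(best, 3))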
Example #4
    def fit_criterion_measure(X, y):
        x = np.asarray(X)  # Converting input data to numpy array
        y = np.asarray(y.reshape((-1, )))

        fc = np.zeros(
            x.shape[1]
        )  # Array with amounts of correct predictions for each feature

        tokens_n = np.max(y) + 1  # Number of different class tokens

        centers = np.empty(
            tokens_n
        )  # Array with centers of sets of feature values for each class token
        variances = np.empty(
            tokens_n
        )  # Array with variances of sets of feature values for each class token
        # Each of arrays above will be separately calculated for each feature

        distances = np.empty(
            tokens_n
        )  # Array with distances between sample's value and each class's center
        # This array will be separately calculated for each feature and each sample

        for feature_index, feature in enumerate(x.T):  # For each feature
            # Initializing utility structures
            class_values = [
                [] for _ in range(tokens_n)
            ]  # Array with lists of feature values for each class token
            for index, value in enumerate(y):  # Filling array
                class_values[value].append(feature[index])
            for token, values in enumerate(
                    class_values
            ):  # For each class token's list of feature values
                tmp_arr = np.array(values)
                centers[token] = np.mean(tmp_arr)
                variances[token] = np.var(tmp_arr)

            # Main calculations
            for sample_index, value in enumerate(
                    feature):  # For each sample value
                for i in range(tokens_n):  # For each class token
                    # A 0/0 division here can raise warnings; in that case the
                    # default results are still interpreted correctly.
                    distances[i] = np.abs(value - centers[i]) / variances[i]
                fc[feature_index] += np.argmin(distances) == y[sample_index]

        fc /= y.shape[0]
        return dict(zip(generate_features(x), fc))
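A compact, self-contained illustration of the same fit-criterion measure on a toy dataset; generate_features is not reproduced here, so the scores are kept as a plain array indexed by feature position instead of the dict built above.

import numpy as np

# Two features: the first separates the classes well, the second is mostly noise.
X = np.array([[0.1, 5.0], [0.2, 1.0], [0.9, 4.0], [1.1, 2.0]])
y = np.array([0, 0, 1, 1])

scores = np.zeros(X.shape[1])
for j, feature in enumerate(X.T):
    centers = np.array([feature[y == c].mean() for c in (0, 1)])
    variances = np.array([feature[y == c].var() for c in (0, 1)])
    for value, label in zip(feature, y):
        distances = np.abs(value - centers) / variances
        scores[j] += np.argmin(distances) == label

print(scores / len(y))  # fraction of samples each feature "classifies" correctly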
Example #5
def process_fov(fov_image,
                segmentation_output,
                output_dir,
                objects_per_patch=50,
                shape="bitmap"):
    """
    Generates the supervisely directory for the given gov.
    
    Arguments:
        fov_image: ndarray 2D 
            Numpy array to the gray scale image 
        segmentation_output: ndarray 2D uint16
            The instance segmentation output. One int id per object.
        output_dir: foldername
            The location of the output supervisely folder
        objects_per_patch: int
            Number of objects per supervisely ROI
        shape: string ["bitmap", "polygon"]
            The shape of nucleus in the supervisely project. 
    """

    scaled = generate_bmp(fov_image)
    project_dir = output_dir
    masks = segmentation_output
    if os.path.isdir(project_dir):
        shutil.rmtree(project_dir)

    os.mkdir(project_dir)

    ann_dir = os.path.join(project_dir, "ann")
    image_dir = os.path.join(project_dir, "img")

    os.mkdir(image_dir)
    os.mkdir(ann_dir)

    #Create the template for the project directory
    meta_file = os.path.join(project_dir, "..", "meta.json")
    meta_json = generate_project_template(shape)

    with open(meta_file, 'w') as outfile:
        json.dump(meta_json, outfile)

    edges, center_blobs, areas, borders, contours = generate_features(masks)

    x_dim = masks.shape[0]
    y_dim = masks.shape[1]

    grid_size, id_list = calcuate_grid_size(masks, objects_per_patch)

    # Sort the contours based on their centroid. Every ROI will be decomposed
    # into 5 vertical stripes, then the objects will be sorted by their
    # horizontal index.
    contours = sorted(
        contours, key=lambda k: [int(k[2][0] / (grid_size / 5)), -1 * k[2][1]])

    processed_ids = set([])
    x_index = 0
    for x_start in range(0, x_dim, grid_size):
        y_index = 0
        for y_start in range(0, y_dim, grid_size):
            lower_bound = x_start
            upper_bound = x_start + grid_size

            left_bound = y_start
            right_bound = y_start + grid_size

            if upper_bound > x_dim:
                upper_bound = x_dim
            if right_bound > y_dim:
                right_bound = y_dim

            roi_ids = list(
                np.unique(masks[lower_bound:upper_bound,
                                left_bound:right_bound]))
            if 0 in roi_ids:
                roi_ids.remove(0)

            roi_ids_set = set(roi_ids)

            roi_to_be_processed = list(roi_ids_set.difference(processed_ids))
            roi_already_processed = list(roi_ids_set & processed_ids)
            #to_be_processed = list(id_list & roi_ids_set)

            #already_processed = region_ids_set.difference(id_list)

            #update id_list with the remaining ids that will be processed
            #id_list = id_list.difference(region_ids_set)
            processed_ids.update(roi_to_be_processed)

            print("{0}:{1}, {2}:{3} to process {4}, already_processed: {5}".
                  format(lower_bound, upper_bound, left_bound, right_bound,
                         len(roi_to_be_processed), len(roi_already_processed)))

            if shape == "polygon":
                contours_to_process = [
                    cont[0] for cont in contours
                    if cont[1] in roi_to_be_processed
                ]
                contours_already_processed = [
                    cont[0] for cont in contours
                    if cont[1] in roi_already_processed
                ]
                object_func = create_polygon
            elif shape == "bitmap":
                contours_to_process = [(cont[3], cont[4]) for cont in contours
                                       if cont[1] in roi_to_be_processed]
                contours_already_processed = [
                    (cont[3], cont[4]) for cont in contours
                    if cont[1] in roi_already_processed
                ]
                object_func = create_bitmap

            bb = (left_bound, upper_bound, right_bound, lower_bound)
            annotations = create_ann(
                masks.shape,
                contours_to_process,
                object_func,
                bb=bb,
                already_processed=contours_already_processed)

            result_prefix = "{0}_{1}_{2}_{3}_{4}_{5}".format(
                x_index, y_index, lower_bound, upper_bound, left_bound,
                right_bound)

            ann_file_name = result_prefix + ".json"
            ann_file = os.path.join(ann_dir, ann_file_name)

            with open(ann_file, 'w') as outfile:
                json.dump(annotations, outfile)

            image_file_name = result_prefix + ".bmp"
            image_file = os.path.join(image_dir, image_file_name)

            imsave(image_file, scaled)
            #shutil.copyfile(bmp_image_file, image_file)
            y_index += 1
        x_index += 1
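A hypothetical call to process_fov; the image and instance mask below are synthetic placeholders, and the helpers the example relies on (generate_bmp, generate_features, calcuate_grid_size, create_ann, create_bitmap, create_polygon) are assumed to be importable from the same module.

import numpy as np

# Synthetic 512x512 grayscale FOV and a matching uint16 instance mask with two objects.
fov = (np.random.rand(512, 512) * 255).astype(np.uint8)
labels = np.zeros((512, 512), dtype=np.uint16)
labels[100:130, 100:130] = 1
labels[300:340, 260:300] = 2

process_fov(fov, labels, output_dir="supervisely_out/fov_0",
            objects_per_patch=50, shape="bitmap")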
Example #6
def load_data_features():
    """
    Loads the data, if the data is not splitted yet the data will be split in a train and val set
    """

    RANDOM_STATE = 123

    train_file = Path("data/train.csv")

    if train_file.exists():
        train = pd.read_csv("data/train.csv")
        val = pd.read_csv("data/val.csv")
        test = pd.read_csv("data/test.csv")

        features_train = pd.read_csv("data/features_train.csv")
        features_val = pd.read_csv("data/features_val.csv")
        features_test = pd.read_csv("data/features_test.csv")

    else:
        # Split normal data
        train_cola = pd.read_csv("data/SemEval/olid-training-v1.0.tsv",
                                 delimiter="\t")
        test_cola = pd.read_csv("data/SemEval/testset-levela.tsv",
                                delimiter="\t")
        labels_cola = pd.read_csv("data/SemEval/labels-levela.csv",
                                  header=None)
        labels_cola.columns = ['id', 'subtask_a']

        test = pd.merge(test_cola, labels_cola, on='id')

        # Remove duplicates
        train_cola = train_cola.drop_duplicates("tweet")
        test = test.drop_duplicates("tweet")

        train, val = train_test_split(train_cola,
                                      test_size=0.2,
                                      random_state=RANDOM_STATE)
        train = train.reset_index(drop=True)
        val = val.reset_index(drop=True)

        train = train[["tweet", "subtask_a"]]
        val = val[["tweet", "subtask_a"]]
        test = test[["tweet", "subtask_a"]]

        train.columns = ['text', 'label']
        val.columns = ['text', 'label']
        test.columns = ['text', 'label']

        # Generate features
        features_train = generate_features(train)
        features_val = generate_features(val)
        features_test = generate_features(test)

        train.to_csv("data/train.csv", index=False)
        val.to_csv("data/val.csv", index=False)
        test.to_csv("data/test.csv", index=False)

        features_train.to_csv("data/features_train.csv")
        features_val.to_csv("data/features_val.csv")
        features_test.to_csv("data/features_test.csv")

    return train, val, test, features_train, features_val, features_test
Example #7
def load_data(subtask, use_features=False):
    """
    Loads the data, if the data is not splitted yet the data will be split in a train and val set

    Args:
        subtask = a, b, c
    """

    subtask_name = "subtask_" + subtask

    RANDOM_STATE = 123

    train_file = Path("data/train_" + subtask + ".csv")

    if train_file.exists():
        train = pd.read_csv("data/train_" + subtask + ".csv")
        val = pd.read_csv("data/val_" + subtask + ".csv")
        test = pd.read_csv("data/test_" + subtask + ".csv")

        if use_features:
            features_train = pd.read_csv("data/features_train_" + subtask +
                                         ".csv")
            features_val = pd.read_csv("data/features_val_" + subtask + ".csv")
            features_test = pd.read_csv("data/features_test_" + subtask +
                                        ".csv")

    else:
        # Split normal data
        train_cola = pd.read_csv("data/SemEval/olid-training-v1.0.tsv",
                                 delimiter="\t")
        test_cola = pd.read_csv("data/SemEval/testset-level" + subtask +
                                ".tsv",
                                delimiter="\t")
        labels_cola = pd.read_csv("data/SemEval/labels-level" + subtask +
                                  ".csv",
                                  header=None)
        labels_cola.columns = ['id', subtask_name]

        test = pd.merge(test_cola, labels_cola, on='id')

        # Remove duplicates
        train_cola = train_cola.drop_duplicates("tweet")
        test = test.drop_duplicates("tweet")

        #Remove nan in a certain column
        train_cola = train_cola.dropna(subset=[subtask_name])
        test = test.dropna(subset=[subtask_name])

        train, val = train_test_split(train_cola,
                                      test_size=0.2,
                                      random_state=RANDOM_STATE)
        train = train.reset_index(drop=True)
        val = val.reset_index(drop=True)

        train = train[["tweet", subtask_name]]
        val = val[["tweet", subtask_name]]
        test = test[["tweet", subtask_name]]

        train.columns = ['text', 'label']
        val.columns = ['text', 'label']
        test.columns = ['text', 'label']

        train.to_csv("data/train_" + subtask + ".csv", index=False)
        val.to_csv("data/val_" + subtask + ".csv", index=False)
        test.to_csv("data/test_" + subtask + ".csv", index=False)

        if use_features:
            # Generate features
            features_train = generate_features(train)
            features_val = generate_features(val)
            features_test = generate_features(test)

            features_train.to_csv("data/features_train_" + subtask + ".csv",
                                  index=False)
            features_val.to_csv("data/features_val_" + subtask + ".csv",
                                index=False)
            features_test.to_csv("data/features_test_" + subtask + ".csv",
                                 index=False)

    if use_features:
        return train, val, test, features_train, features_val, features_test

    return train, val, test
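A hypothetical call to the loader above, assuming the hard-coded OLID/OffensEval files are present under data/SemEval; the subtask letter selects which label column is used.

# Plain splits for subtask A; feature-augmented splits for subtask B.
train, val, test = load_data("a")
train_b, val_b, test_b, f_train, f_val, f_test = load_data("b", use_features=True)
print(len(train), len(val), len(test))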
Example #8
def run(cfg):
    generate_features(globals(), cfg.overwrite)
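The one-liner above relies on a project-local generate_features helper that scans globals() for feature classes; that helper is not shown, so the following is only a guessed sketch of the usual pattern (a Feature base class whose subclasses each write one feature file, skipped when the output already exists and overwrite is false), not the project's actual implementation.

import inspect
from abc import ABC, abstractmethod
from pathlib import Path

import pandas as pd


class Feature(ABC):
    """Hypothetical base class: each subclass materializes one feature file."""
    out_dir = Path("features")

    def __init__(self):
        self.name = self.__class__.__name__
        self.path = self.out_dir / f"{self.name}.ftr"

    @abstractmethod
    def create_features(self) -> pd.DataFrame:
        ...

    def run(self):
        self.out_dir.mkdir(parents=True, exist_ok=True)
        self.create_features().to_feather(self.path)


def generate_features(namespace, overwrite):
    # Pick every concrete Feature subclass defined in the namespace and run it,
    # skipping classes whose output file already exists (unless overwrite is set).
    for obj in namespace.values():
        if inspect.isclass(obj) and issubclass(obj, Feature) and obj is not Feature:
            feature = obj()
            if feature.path.exists() and not overwrite:
                print(f"{feature.name} already exists, skipping")
            else:
                feature.run()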
Example #9
    def run(self, X, y, feature_names=None):
        """
        X: shape(n_samples, n_features) feature matrix, an array filled with features.
        y: shape(n_samples, 1) label matrix, used to calculate SU value for each feature.

        :return: shape(n_samples, self.__n_features) an array filled with the selected features.
        """
        # Calculate the entropy of y.
        entropy = self.cal_entropy(y)
        feature_names = generate_features(X, feature_names)
        # Calculate conditional entropy for each feature.
        self.feature_scores = dict()

        for index in range(len(X.T)):
            dict_i = dict()
            for i in range(len(X.T[index])):
                if X.T[index][i] not in dict_i:
                    dict_i.update({X.T[index][i]: [i]})
                else:
                    dict_i[X.T[index][i]].append(i)
            # print(dict_i)

            # Conditional entropy of a feature.
            con_entropy = 0.0
            # Entropy of each feature
            entropy_x = self.cal_entropy(X[:, index])
            # get corresponding values in y.
            for f in dict_i.values():
                # Probability of each class in a feature.
                p = len(f) / len(X.T[0])
                # Dictionary of corresponding probability in labels.
                dict_y = dict()
                for i in f:
                    if y.T[i] not in dict_y:
                        dict_y.update({y.T[i]: 1})
                    else:
                        dict_y[y.T[i]] += 1

                # calculate the probability of corresponding label.
                sub_entropy = 0.0
                for l in dict_y.values():
                    sub_entropy += -l / sum(dict_y.values()) * math.log(
                        l / sum(dict_y.values()), 2)

                con_entropy += sub_entropy * p
            # self._features.append(
            #     {"su": 2 * (entropy - con_entropy) / (entropy_x + entropy), "index": feature_names[index]})

            self.feature_scores[feature_names[index]] = 2 * (
                entropy - con_entropy) / (entropy_x + entropy)

        # Sort by symmetric uncertainty in descending order.
        new_list = list(
            sorted(self.feature_scores.items(),
                   reverse=True,
                   key=lambda k: k[1]))

        # Map the feature names that fall outside the top __n_features back to
        # their column positions so they can all be removed in a single delete
        # (repeated single-column deletes would shift the remaining indices).
        name_to_col = {name: col for col, name in enumerate(feature_names)}
        drop_cols = [name_to_col[name] for name, _ in new_list[self.__n_features:]]
        X = np.delete(X, drop_cols, axis=1)

        return X
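For reference, a compact, self-contained computation of the same symmetric-uncertainty score, SU = 2 * (H(y) - H(y|x)) / (H(x) + H(y)), for a single discrete feature on toy data.

from collections import Counter

import numpy as np


def entropy(values):
    counts = np.array(list(Counter(values).values()), dtype=float)
    p = counts / counts.sum()
    return float(-(p * np.log2(p)).sum())


def symmetric_uncertainty(x, y):
    h_x, h_y = entropy(x), entropy(y)
    # Conditional entropy H(y | x): weight the entropy of y within each group of equal x.
    h_y_given_x = 0.0
    for value in set(x):
        subset = [yi for xi, yi in zip(x, y) if xi == value]
        h_y_given_x += len(subset) / len(y) * entropy(subset)
    return 2 * (h_y - h_y_given_x) / (h_x + h_y)


x = [0, 0, 1, 1, 2, 2]  # a feature that fully determines the label
y = [0, 0, 1, 1, 1, 1]
print(symmetric_uncertainty(x, y))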
Example #10
def run(cfg):
    # If overwrite is False, existing feature files are not overwritten.
    # Feature classes defined in this file are picked out of globals() and each one is executed.
    print("Start!")
    generate_features(globals(), cfg.base.overwrite)
    print("Success!")
Example #11
def run(cfg):
    # If overwrite is False, existing feature files are not overwritten.
    # Feature classes defined in this file are picked out of globals() and each one is executed.
    generate_features(globals(), cfg.base.overwrite)