Example #1
from skmultilearn.model_selection import IterativeStratification

# 10 splits will be sufficient and leave 90% of the data for training

n_split = 10

k_fold = IterativeStratification(n_splits=n_split, order=1)

i = 1

# Check that the splits are roughly balanced and, crucially, save the data set split
# for each fold into a file for later analysis

# Save the training and validation indices for each fold so that we can use them on the server
train_indices = []
val_indices = []
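
# The loop below assumes a `proportions` helper that is not part of this snippet;
# a minimal sketch, assuming the labels form a binary indicator matrix:
import numpy as np

def proportions(labels):
    # fraction of positive samples per label column
    return np.asarray(labels).mean(axis=0)
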

for train, val in k_fold.split(total_x, svm_y):
    print("Fold" + str(i))
    temp_1 = proportions(y[train])
    temp_2 = proportions(y[val])
    print("Train set")
    print(temp_1)
    print("Validation set")
    print(temp_2)
    i += 1
    train_indices.append(train)
    val_indices.append(val)

# In[ ]:

# Now save the indices for GPU use on the server
# with open('RNN Training Indices.pkl', 'wb') as f:
#     pickle.dump((train_indices, val_indices), f)

    def _featurecollection(self) -> Tuple[np.array, np.array, list]:
        """
        Runs the feature collection workflow.

        Returns:
            Tuple[np.array, np.array, list] -- numpy arrays of features and labels and list of names
        """
        feature_list = FeatureCollector.create_feature_list(
            self.picklefiles, self.forbidden_list, self.old_format)
        label_raw = read_pickle(self.labelpath)
        # collectorlogger.info(f'found {len(label_raw)} labels')
        label_list = FeatureCollector.make_labels_table(label_raw)
        df = FeatureCollector._create_clean_dataframe(feature_list, label_list,
                                                      self.drop_duplicates)

        # shuffle dataframe for the next steps to ensure randomization
        df = df.sample(frac=1).reset_index(drop=True)

        # set offset of select features
        offset = 0
        if self.racsdf is not None:
            offset = len(self.selected_racs)
            df = FeatureCollector._merge_racs_frame(df, self.racsdf,
                                                    self.selected_racs)

        if self.percentage_holdout > 0:
            # Make a stratified split that also ensures no structure from the training set appears in the test set.
            # This matters because chemical environments in related structures can be quite similar
            # (Pauling's parsimony principle), and we do not want to leak that information from training into test.
            df["base_name"] = [n.strip("0123456789") for n in df["name"]]
            df_name_select = df.drop_duplicates(subset=["base_name"])
            df_name_select["numbers"] = (
                df_name_select["metal"].astype("category").cat.codes)
            stratifier = IterativeStratification(
                n_splits=2,
                order=2,
                sample_distribution_per_fold=[
                    self.percentage_holdout,
                    1.0 - self.percentage_holdout,
                ],
            )
            train_indexes, test_indexes = next(
                stratifier.split(df_name_select,
                                 df_name_select[["oxidationstate",
                                                 "numbers"]]))

            train_names = df_name_select.iloc[train_indexes]
            test_names = df_name_select.iloc[test_indexes]
            train_names = list(train_names["base_name"])
            test_names = list(test_names["base_name"])

            df_train = df[df["base_name"].isin(train_names)]
            df_test = df[df["base_name"].isin(test_names)]

            x, self.y, self.names = FeatureCollector._get_x_y_names(df_train)
            self.x = FeatureCollector._select_features(self.selected_features,
                                                       x, self.outdir_helper,
                                                       offset)

            x_test, self.y_test, self.names_test = FeatureCollector._get_x_y_names(
                df_test)
            self.x_test = FeatureCollector._select_features(
                self.selected_features, x_test, self.outdir_helper, offset)

        else:  # no separate holdout set
            x, self.y, self.names = FeatureCollector._get_x_y_names(df)
        if self.training_set_size:  # perform farthest point sampling to select a fixed number of training points
            collectorlogger.debug(
                "will now perform farthest point sampling on the feature matrix"
            )
            # Write one additional holdout set
            assert self.training_set_size < len(df_train)

            x, self.y, self.names = FeatureCollector._get_x_y_names(df_train)
            x = FeatureCollector._select_features(self.selected_features, x,
                                                  self.outdir_helper, offset)

            # indices = greedy_farthest_point_samples(x, self.training_set_size)
            indices = apricot_select(x, self.training_set_size)

            _df_train = df_train

            good_indices = _df_train.index.isin(indices)
            df_train = _df_train[good_indices]
            x, self.y, self.names = FeatureCollector._get_x_y_names(df_train)

            df_validation = _df_train[~good_indices]
            x_valid, self.y_valid, self.names_valid = FeatureCollector._get_x_y_names(
                df_validation)

            self.x_valid = FeatureCollector._select_features(
                self.selected_features, x_valid, self.outdir_helper, offset)

        self.x = FeatureCollector._select_features(self.selected_features, x,
                                                   self.outdir_helper, offset)
        collectorlogger.debug("the feature matrix shape is %s", self.x.shape)
Example #3
import pickle
from skmultilearn.model_selection import IterativeStratification

# 10 splits will be sufficient and leave 90% of the data for training

n_split = 10

k_fold = IterativeStratification(n_splits=n_split, order=1)

i = 1

# Check that the splits are roughly balanced and, crucially, save the data set split
# for each fold into a file for later analysis

# Save the training and validation indices for each fold so that we can use them on the server
train_indices = []
val_indices = []

for train, val in k_fold.split(x, y):
    print("Fold" + str(i))
    temp_1 = proportions(y[train])
    temp_2 = proportions(y[val])
    print("Train set")
    print(temp_1)
    print("Validation set")
    print(temp_2)
    i += 1
    train_indices.append(train)
    val_indices.append(val)

# In[ ]:

# Now save the indices for GPU use on the server, since we cannot install skmultilearn there
with open('RNN Training Indices.pkl', 'wb') as f:
    pickle.dump((train_indices, val_indices), f)

    def test_if_stratification_works(self):
        stratifier = IterativeStratification(n_splits=2, order=1)
        X = np.matrix([[0], [1], [2], [3]])
        y = np.matrix([[0, 0], [1, 0], [0, 1], [1, 1]])
        self.assertEqual(len(list(stratifier.split(X, y))), 2)
def cross_validation(clfs, X, y_true, cardio_dict, gender_dict, num_fold=10):
    from sklearn.model_selection import StratifiedKFold
    from prettytable import PrettyTable
    from tensorflow.keras.utils import to_categorical
    from skmultilearn.model_selection import IterativeStratification

    skf = IterativeStratification(n_splits=num_fold, random_state=10)
    wrong_instances_clf = {}
    for clf_name in clfs:

        avg_metrics_cardio = [0, 0, 0, 0]
        avg_metrics_gender = [0, 0, 0, 0]

        print('we are trying classifier: ', clf_name)
        clf = clfs[clf_name]
        best_f_score = 0
        best_clf = None
        i = 1

        t_cardio = PrettyTable(
            ['Fold', 'acc', 'precision', 'recall', 'f_score'])
        # t_cardio.title = 'cardio'

        t_gender = PrettyTable(
            ['Fold', 'acc', 'precision', 'recall', 'f_score'])
        # t_gender.title = 'gender'

        wrong_instances = []
        for train_index, val_index in skf.split(X, y_true):

            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y_true[train_index], y_true[val_index]

            backtrack_dict = dict(enumerate(val_index))

            clf.fit(X_train, y_train, verbose=0, epochs=100, batch_size=8192)
            acc, precision, recall, f_score = model_evaluation(
                clf, X_val, y_val, cardio_dict, gender_dict)
            data_util.reset_weights(clf)

            # wrong_instances.extend(wrong_instances_fold)

            # print(' '+str(i)+' ', acc, precision, recall, f_score)
            t_cardio.add_row([
                ' ' + str(i) + ' ', '{0:.3f}'.format(acc[0]),
                '{0:.3f}'.format(precision[0]), '{0:.3f}'.format(recall[0]),
                '{0:.3f}'.format(f_score[0])
            ])
            t_gender.add_row([
                ' ' + str(i) + ' ', '{0:.3f}'.format(acc[1]),
                '{0:.3f}'.format(precision[1]), '{0:.3f}'.format(recall[1]),
                '{0:.3f}'.format(f_score[1])
            ])

            avg_metrics_cardio[0] += acc[0]
            avg_metrics_cardio[1] += precision[0]
            avg_metrics_cardio[2] += recall[0]
            avg_metrics_cardio[3] += f_score[0]
            avg_metrics_gender[0] += acc[1]
            avg_metrics_gender[1] += precision[1]
            avg_metrics_gender[2] += recall[1]
            avg_metrics_gender[3] += f_score[1]
            i += 1
        i -= 1
        t_cardio.add_row([
            'avg', '{0:.3f}'.format(avg_metrics_cardio[0] / i),
            '{0:.3f}'.format(avg_metrics_cardio[1] / i),
            '{0:.3f}'.format(avg_metrics_cardio[2] / i),
            '{0:.3f}'.format(avg_metrics_cardio[3] / i)
        ])
        t_gender.add_row([
            'avg', '{0:.3f}'.format(avg_metrics_gender[0] / i),
            '{0:.3f}'.format(avg_metrics_gender[1] / i),
            '{0:.3f}'.format(avg_metrics_gender[2] / i),
            '{0:.3f}'.format(avg_metrics_gender[3] / i)  # was avg_matrics_cardio[3]: copy-paste bug
        ])

        # print('avg',avg_matrics[0]/i,avg_matrics[1]/i,avg_matrics[2]/i,avg_matrics[3]/i)
        # wrong_instances_clf[clf_name] = wrong_instances
        print('+--------------------------------------------+')
        print('|                  cardio                    |')
        print('+--------------------------------------------+')
        print(t_cardio)
        print('+--------------------------------------------+')
        print('|                   gender                   |')
        print('+--------------------------------------------+')
        print(t_gender)

    return
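
# `model_evaluation` (and `data_util.reset_weights`) are defined elsewhere; a minimal
# sketch of `model_evaluation`, assuming the model emits two sigmoid outputs
# (cardio, gender) and ignoring the two label dictionaries:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def model_evaluation(clf, X_val, y_val, cardio_dict, gender_dict):
    y_pred = (clf.predict(X_val) > 0.5).astype(int)
    acc, precision, recall, f_score = [], [], [], []
    for col in range(y_val.shape[1]):
        acc.append(accuracy_score(y_val[:, col], y_pred[:, col]))
        p, r, f, _ = precision_recall_fscore_support(
            y_val[:, col], y_pred[:, col], average='binary')
        precision.append(p)
        recall.append(r)
        f_score.append(f)
    return acc, precision, recall, f_score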
Example #6
def eval_model(model, X_train, y_train, id_=None):
    start_time = time()
    logging.info('*' * 20)
    logging.info("Evaluating model {}".format(id_ if id_ else model))

    n_splits = 3
    output = None

    # Try to load saved result from disk if exists
    if id_:
        output = Path('output') / id_
        output.mkdir(parents=True, exist_ok=True)
        if path.exists(output / 'score.pkl'):
            logging.debug("Loading result from disk")
            log_loss_, auc, f1 = pickle.load(open(output / 'score.pkl', 'rb'))
            logging.info("The Average Log Loss is {}".format(log_loss_))
            logging.info("The Average AUC is {}".format(auc))
            logging.info("The Average f1 is {}".format(f1))
            return log_loss_, auc, f1

    # Deprecated sklearn k-fold
    # kf = StratifiedKFold(n_splits=n_splits)
    # kf.get_n_splits(X_train)

    kf = IterativeStratification(n_splits=n_splits, order=1)

    log_loss_, auc, f1 = 0.0, 0.0, 0.0
    for i, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
        X_train_, X_val_ = X_train.iloc[train_index].values, X_train.iloc[
            test_index].values
        y_train_, y_val_ = y_train.iloc[train_index].values, y_train.iloc[
            test_index].values

        # Add dummy sample to make sure every column has 2 labels
        X_train_ = np.vstack((X_train_, np.zeros((1, X_train_.shape[1]))))
        y_train_ = np.vstack((y_train_, np.ones((1, y_train_.shape[1]))))

        model.fit(X_train_, y_train_)
        y_pred_ = model.predict(X_val_)

        log_loss_val, auc_val, f1_val = scorer(y_val_, y_pred_)

        # Pickle y_val_ and y_pred_
        if id_:
            pickle.dump((y_val_, y_pred_),
                        open(output / "val_{}.pkl".format(i), 'wb'))

        # Update the scores
        log_loss_ += log_loss_val
        auc += auc_val
        f1 += f1_val

    log_loss_ /= n_splits
    auc /= n_splits
    f1 /= n_splits
    if id_:
        pickle.dump((log_loss_, auc, f1), open(output / 'score.pkl', 'wb'))
    logging.info("The Average Log Loss is {}".format(log_loss_))
    logging.info("The Average AUC is {}".format(auc))
    logging.info("The Average f1 is {}".format(f1))
    logging.info("Used {:.2f}s".format(time() - start_time))
    return log_loss_, auc, f1
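
# `scorer` is defined elsewhere; a minimal sketch, assuming each metric is averaged
# over the label columns (and that every label appears in the validation fold):
import numpy as np
from sklearn.metrics import log_loss, roc_auc_score, f1_score

def scorer(y_true, y_pred):
    if hasattr(y_pred, 'todense'):  # skmultilearn models return sparse matrices
        y_pred = np.asarray(y_pred.todense())
    cols = range(y_true.shape[1])
    ll = np.mean([log_loss(y_true[:, j], y_pred[:, j], labels=[0, 1]) for j in cols])
    auc = np.mean([roc_auc_score(y_true[:, j], y_pred[:, j]) for j in cols])
    f1 = np.mean([f1_score(y_true[:, j], y_pred[:, j]) for j in cols])
    return ll, auc, f1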
Example #7
def evaluate(model,
             adjacency_matrix,
             features,
             labels,
             labels_mask,
             proportions,
             n_trials=1,
             random_state=None):
    scores = {
        'proportion': proportions,
        'micro': np.zeros(len(proportions)),
        'macro': np.zeros(len(proportions)),
        'std': np.zeros(len(proportions)),
        'c': np.zeros(len(proportions))
    }
    for i, train_ratio in enumerate(proportions):
        indices = np.arange(adjacency_matrix.shape[0])
        labeled_indices = indices[labels_mask]
        not_labeled_indices = np.setdiff1d(indices, labeled_indices)
        std = list()
        for _ in range(n_trials):
            stratifier = IterativeStratification(
                n_splits=2,
                order=2,
                sample_distribution_per_fold=[1.0 - train_ratio, train_ratio],
                random_state=random_state)
            model.__init__()
            train_ind_l, test_ind_l = next(stratifier.split(labels, labels))
            train_ids_nl, _test_ids_nl = sklearn.model_selection.train_test_split(
                not_labeled_indices,
                train_size=train_ratio,
                test_size=1 - train_ratio)
            train_ids = np.concatenate(
                [labeled_indices[train_ind_l],
                 train_ids_nl])  # order is important, labeled first
            test_ids = labeled_indices[test_ind_l]
            adjacency_matrix_train = adjacency_matrix[
                train_ids][:, train_ids].copy()
            adjacency_matrix_train.eliminate_zeros()
            features_train = [features[i] for i in train_ids]
            features_test = [features[i] for i in test_ids]
            labels_train = labels[train_ind_l]
            labels_test = labels[test_ind_l]
            model.fit(adjacency_matrix_train, features_train)
            vectors_train = np.array(
                model.get_embeddings_new(features_train))[:len(train_ind_l)]
            vectors_test = np.array(model.get_embeddings_new(features_test))
            logger.debug(
                f"train: {train_ids.shape} nodes, {labels_train.sum()} labels")
            logger.debug(
                f"test: {test_ids.shape} nodes, {labels_test.sum()} labels, (+{_test_ids_nl.shape} forgotten nodes)"
            )
            logger.debug(
                f"adjacency: {adjacency_matrix.shape}, {adjacency_matrix_train.shape}"
            )
            logger.debug(
                f"train vectors: {vectors_train.shape}, test vectors: {vectors_test.shape}"
            )
            mi, ma, c = train_and_predict(vectors_train, vectors_test,
                                          labels_train, labels_test)
            std.append(mi)
            scores['micro'][i] += mi / n_trials
            scores['macro'][i] += ma / n_trials
            scores['c'][i] += c / n_trials
        scores['std'][i] += np.array(std).std()

    return scores
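
# `train_and_predict` is defined elsewhere; a minimal sketch, assuming a one-vs-rest
# logistic regression on the embeddings that reports micro/macro F1 and the C used:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier

def train_and_predict(vectors_train, vectors_test, labels_train, labels_test, c=1.0):
    clf = OneVsRestClassifier(LogisticRegression(C=c, max_iter=1000))
    clf.fit(vectors_train, labels_train)
    pred = clf.predict(vectors_test)
    return (f1_score(labels_test, pred, average='micro'),
            f1_score(labels_test, pred, average='macro'),
            c)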
Example #8
# #Feature Engineering
# sns.pairplot(dataf.sample(2))
# sns_plot.savefig("pairplot.png")

# plt.clf() # Clear pairplot figure from sns
# Image(filename='pairplot.png') # Show pairplot as image

dataf.dtypes
dataf.drop(["Patient Name"],axis=1)
# print(dataf.iloc[:,52:88])
X = dataf.iloc[:,1:53]
y = dataf.iloc[:,53:90]

# K-fold Iterative Stratification
k_fold = IterativeStratification(n_splits=40, order=1)
for train_index, test_index in k_fold.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # classifier.fit(X_train, y_train)
    # result = classifier.predict(X_test)


mlb = MultiLabelBinarizer()

start = time.time()
classifier = BinaryRelevance(
    classifier=RandomForestClassifier(n_estimators=100, criterion='gini'),
    require_dense=[False, True]
)

classifier.fit(X_train, y_train)
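
# A hedged continuation to score the fitted model on the last fold;
# BinaryRelevance.predict returns a sparse indicator matrix.
from sklearn.metrics import accuracy_score, hamming_loss

result = classifier.predict(X_test)
print('subset accuracy:', accuracy_score(y_test, result.toarray()))
print('hamming loss:', hamming_loss(y_test, result.toarray()))
print('training took {:.1f}s'.format(time.time() - start))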
def train_test_split(data,
                     test_size=0.2,
                     group='patient_id',
                     labels=None,
                     seed=None):
    # Author: Kristian & Tobias
    """
     Split dataset into random train and test subsets, while keeping all the
     samples of one patient in the same set.

     Parameters:
        data (pd.Dataframe):
            The data that should be split.
            Needs to contain a column with the name specified by the group parameter.
        test_size (float): (Default: 0.2)
            A number between 0.0 and 1.0 specifying the relative size of the test set.
        group (string): (Default: 'patient_id')
            The name of the column that the data should be split by.
            Having multiple entries of the same value in this column will result in a split,
            where all of these entries will end up in the same subset.
        labels (list):
            A list of labels that are taken in consideration when performing the stratified
            split
        seed (int): (Default: None)
            Controls the shuffling applied to the data before it is split.
     Returns:
        train_data, test_data (tuple of pd.DataFrame):
            The train and test splits
    """

    if group not in data.columns:
        raise Exception('The column ' + group + ' does not exist')

    if labels:

        stratifier = IterativeStratification(
            n_splits=2,
            order=2,
            sample_distribution_per_fold=[test_size, 1.0 - test_size],
            random_state=seed)

        # split into stratified test and train set
        train_idx, test_idx = next(stratifier.split(data, data[labels]))

        train_data = data.iloc[train_idx]
        test_data = data.iloc[test_idx]

        # get group ids that are found in both sets
        splitted_group_ids = np.intersect1d(train_data[group].to_numpy(),
                                            test_data[group].to_numpy())

        train_value_counts = train_data[group].value_counts()
        test_value_counts = test_data[group].value_counts()

        # iterate through groups and move either to test or train based on
        # where the value count of the group is higher
        for group_id in splitted_group_ids:
            if train_value_counts[group_id] > test_value_counts[group_id]:
                rows = test_data[test_data[group] == group_id]
                train_data = pd.concat([train_data, pd.DataFrame(rows)])

                test_data = test_data[test_data[group] != group_id]
            else:
                rows = train_data[train_data[group] == group_id]
                test_data = pd.concat([test_data, pd.DataFrame(rows)])

                train_data = train_data[train_data[group] != group_id]

    else:
        # create the group shuffle splitter
        shuffle_split = GroupShuffleSplit(test_size=test_size,
                                          random_state=seed)

        # define groups as patient ids
        groups = data[group].to_numpy()

        train_idx, test_idx = next(shuffle_split.split(data, groups=groups))

        train_data = data.iloc[train_idx]
        test_data = data.iloc[test_idx]

    return (train_data, test_data)
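
# Hypothetical usage (column names assumed): every patient ends up on one side of
# the split, while stratifying on the one-hot label columns.
import pandas as pd

df = pd.DataFrame({
    'patient_id': [1, 1, 2, 2, 3, 3, 4, 4],
    'label_a':    [1, 1, 0, 0, 1, 0, 0, 1],
    'label_b':    [0, 0, 1, 1, 0, 1, 1, 0],
})
train_df, test_df = train_test_split(df, test_size=0.25, group='patient_id',
                                     labels=['label_a', 'label_b'], seed=42)
assert not set(train_df['patient_id']) & set(test_df['patient_id'])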
Example #10
    def perform_five_fold(self, model, documents, annotations, doc_ids,
                          pipeline_parameters):
        metrics = list()
        # store list of documents ids per fold
        folds = list()
        # turning into numpy arrays to be able to access values with index array
        documents_np_array = np.array(documents)
        annotations_np_array = np.array(annotations, dtype=object)
        doc_ids_np_array = np.array(doc_ids)
        ann_list = list()

        for ann in annotations_np_array:
            ann_list = ann_list + list([x[2] for x in ann])
        # getting unique label names in annotations
        unique_ann_list = list(set(ann_list))

        # array to store multilabel values
        multilabel_array = []
        for ann in annotations_np_array:
            multilabel_array.append([unique_ann_list.index(x[2]) for x in ann])

        multilabel_matrix = MultiLabelBinarizer().fit_transform(
            multilabel_array)
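        # e.g. MultiLabelBinarizer().fit_transform([[0, 2], [1]]) -> [[1, 0, 1], [0, 1, 0]]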

        skf = IterativeStratification(n_splits=5, order=1)

        total_metrics = {}

        for train_index, test_index in skf.split(documents_np_array,
                                                 multilabel_matrix):
            # get annotations train and test datasets
            train_annotations = annotations_np_array[train_index]
            test_annotations = annotations_np_array[test_index]

            # get documents train and test datasets
            train_documents = documents_np_array[train_index]
            test_documents = documents_np_array[test_index]

            fold_metrics = self.perform_fold(
                model, [train_documents.tolist(),
                        train_annotations.tolist()],
                [test_documents.tolist(),
                 test_annotations.tolist()], pipeline_parameters)

            # saving docs used to train fold
            fold_doc_ids = doc_ids_np_array[train_index]
            folds.append(fold_doc_ids.tolist())

            # saving fold metrics
            metrics.append(fold_metrics)

            for key in fold_metrics.keys():
                if key not in total_metrics:
                    total_metrics[key] = {
                        "FN": 0,
                        "FP": 0,
                        "TP": 0,
                        "TN": 0,
                        "f1": 0,
                        "precision": 0,
                        "recall": 0,
                        "acc": 0
                    }
                total_metrics[key][
                    "FN"] = total_metrics[key]["FN"] + fold_metrics[key]["FN"]
                total_metrics[key][
                    "FP"] = total_metrics[key]["FP"] + fold_metrics[key]["FP"]
                total_metrics[key][
                    "TP"] = total_metrics[key]["TP"] + fold_metrics[key]["TP"]
                total_metrics[key][
                    "TN"] = total_metrics[key]["TN"] + fold_metrics[key]["TN"]

        average_metrics = {}
        for label in total_metrics.keys():
            avg_metric = {}
            avg_metric["FN"] = total_metrics[label]["FN"] / 5
            avg_metric["FP"] = total_metrics[label]["FP"] / 5
            avg_metric["TP"] = total_metrics[label]["TP"] / 5
            avg_metric["TN"] = total_metrics[label]["TN"] / 5
            if (avg_metric["TP"] + avg_metric["FN"]) != 0:
                avg_metric["recall"] = avg_metric["TP"] / (avg_metric["TP"] +
                                                           avg_metric["FN"])
            else:
                avg_metric["recall"] = 1.0
            if (avg_metric["TP"] + avg_metric["FP"]) != 0:
                avg_metric["precision"] = avg_metric["TP"] / (
                    avg_metric["TP"] + avg_metric["FP"])
            else:
                avg_metric["precision"] = 0.0
            if (avg_metric["precision"] + avg_metric["recall"]) != 0:
                avg_metric["f1"] = 2 * (
                    avg_metric["precision"] * avg_metric["recall"]) / (
                        avg_metric["precision"] + avg_metric["recall"])
            else:
                avg_metric["f1"] = 0
            avg_metric["acc"] = (avg_metric["TP"] + avg_metric["TN"]) / (
                avg_metric["TP"] + avg_metric["TN"] + avg_metric["FP"] +
                avg_metric["FN"])

            average_metrics[label] = avg_metric

        return metrics, folds, average_metrics
def get_english_annotated_tweets_sets(config, folder, only_tokenizer=False):
    samples_limit_for_stratification = 8
    print config
    path = "./datasets/english_annotated_tweets/english_tweets/2_tokenized/" + config[
        'tweets_file']
    annotated_tweets = json.load(open(path, 'r'))
    print "Total number of annotated english tweets is", len(annotated_tweets)
    print "Sample tweet", annotated_tweets[0]
    samples_distribution_domain_all = {}
    samples_distribution_subdomain_all = {}
    samples_distribution_domain = {}
    samples_distribution_subdomain = {}
    total_codes = 0
    for tweet in annotated_tweets:
        tweet['domain_codes'] = []
        tweet['subdomain_codes'] = []
        for code in tweet['codes']:
            total_codes += 1
            code = code.strip()
            if code not in samples_distribution_subdomain_all:
                samples_distribution_subdomain_all[code] = 1
            else:
                samples_distribution_subdomain_all[code] += 1
            subdomain = code
            if subdomain not in GLOBAL_MANIFESTOS_SUBDOMAINS:
                #print tweet
                subdomain = SUBSUB_TO_SUB[subdomain.replace('_', '.')]
            domain = code[0]
            tweet['domain_codes'].append(domain)
            tweet['subdomain_codes'].append(subdomain)
            if 'domain' not in tweet:
                tweet['domain'] = domain
                tweet['subdomain'] = subdomain
                if subdomain not in samples_distribution_subdomain:
                    samples_distribution_subdomain[subdomain] = 1
                else:
                    samples_distribution_subdomain[subdomain] += 1
                if domain not in samples_distribution_domain:
                    samples_distribution_domain[domain] = 1
                else:
                    samples_distribution_domain[domain] += 1
            if domain not in DOMAIN_CLASSES:
                print tweet
            if domain not in samples_distribution_domain_all:
                samples_distribution_domain_all[domain] = 1
            else:
                samples_distribution_domain_all[domain] += 1
    """print "Annotated tweets distribution taking into account multi-label annotation-----------------------------------"
    for c in DOMAIN_CLASSES:
        print("Domain %s; Number of samples:  %s; Percentage: %.2f " % (c, samples_distribution_domain_all[c], samples_distribution_domain_all[c]/float(total_codes)*100))
    for c in GLOBAL_MANIFESTOS_SUBDOMAINS:
        if c in samples_distribution_subdomain_all:
            print("Subdomain %s; Number of samples: %s; Percentage: %.2f " % (c, samples_distribution_subdomain_all[c],samples_distribution_subdomain_all[c]/float(total_codes)*100))
    print "-----------------------------------------------------------------------------------------------------------" """
    print "Annotated tweets distribution with multiclass-annotation"
    banned_codes_for_classification = []
    for c in DOMAIN_CLASSES:
        print("Domain %s; Number of samples:  %s; Percentage: %.2f " %
              (c, samples_distribution_domain[c],
               samples_distribution_domain[c] / float(len(annotated_tweets)) *
               100))
    for c in GLOBAL_MANIFESTOS_SUBDOMAINS:
        if c in samples_distribution_subdomain:
            print("Subdomain %s; Number of samples: %s; Percentage: %.2f " %
                  (c, samples_distribution_subdomain[c],
                   samples_distribution_subdomain[c] /
                   float(len(annotated_tweets)) * 100))
            if samples_distribution_subdomain[
                    c] < samples_limit_for_stratification:
                print "NOT ENOUGH SAMPLES!!!!"
                banned_codes_for_classification.append(c)
    print "BANNNED!!!!!!!!!!!!!!!!"
    print banned_codes_for_classification
    if config['party'] and config['previous_phrase']:
        tweets_X = [[], [], []]
    elif config['party'] or config['previous_phrase']:
        tweets_X = [[], []]
    else:
        tweets_X = [[]]
    tweets_y = []
    party_encoder = one_hot_encoder(COUNTRY_PARTIES['english'])
    for tweet in annotated_tweets:
        if config['architecture'] == 'multi_label':
            class_to_pick = 'domain_codes'
        else:
            class_to_pick = config['class']
        if config['class'] == 'manifestos_subdomain':
            if config['architecture'] == 'multi_label':
                class_to_pick = 'subdomain_codes'
            else:
                class_to_pick = 'subdomain'
        if config['architecture'] == 'multi_label':
            tweets_y.append(
                multilabel_array_to_onehot(config['class'],
                                           tweet[class_to_pick]))
        else:
            tweets_y.append(tweet[class_to_pick])
        tweets_X[0].append(tweet['cleaned_text'])
        if config['previous_phrase']:
            if 'previous_tweet' in tweet:
                tweets_X[1].append(tweet['previous_tweet']['cleaned_text'])
            else:
                tweets_X[1].append([])
            if config['party']:
                tweets_X[2].append(party_encoder[tweet['party']])
        elif config['party']:
            tweets_X[1].append(party_encoder[tweet['party']])
    tweets_y = [tweets_y]
    o_phrases = tweets_X[0]  # Original phrases, before their conversion to indexes
    o_eval_test_phrases = []
    o_train_phrases = []
    o_test_phrases = []
    o_train_eval_phrases = []
    if config['previous_phrase']:
        o_prev_phrases = tweets_X[1]
        o_prev_eval_test_phrases = []
        o_prev_train_phrases = []
        o_prev_test_phrases = []
        o_prev_train_eval_phrases = []

    data_X, data_y = load_json_data(
        config['dataset_folder'] + config['dataset'],
        config['previous_phrase'], config['previous_previous'],
        config['post_phrase'], config['party'], config['party_as_deconv'],
        config['class'], config['class_2'], False, False, False,
        config['language'])
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data_X[0] + tweets_X[0])
    if only_tokenizer:
        return tokenizer
    sequences_phrases = tokenizer.texts_to_sequences(tweets_X[0])
    if config['previous_phrase']:
        prev_phrases = tokenizer.texts_to_sequences(tweets_X[1])
        if config['party']:
            tweets_X_party = tweets_X[2]
    elif config['party']:
        tweets_X_party = tweets_X[1]
    if not config['no_padding_for_lstms']:
        tweets_X = pad_sequences(sequences_phrases,
                                 maxlen=config['max_phrase_length'],
                                 padding='post')
        if config['previous_phrase']:
            tweets_X_prev = pad_sequences(prev_phrases,
                                          maxlen=config['max_phrase_length'],
                                          padding='post')
    else:
        tweets_X = pad_sequences(sequences_phrases,
                                 maxlen=config['max_phrase_length'])
        if config['previous_phrase']:
            tweets_X_prev = pad_sequences(prev_phrases,
                                          maxlen=config['max_phrase_length'])
    tweets_X_tmp = []
    tweets_X_prev_tmp = []
    tweets_X_party_tmp = []
    tweets_y_tmp = []
    o_phrases_tmp = []
    o_prev_phrases_tmp = []
    if config['previous_phrase']:
        if not config['party']:
            for tweet, prev_tweet, y_label, o_phrase, o_prev_phrase in zip(
                    tweets_X, tweets_X_prev, tweets_y[0], o_phrases,
                    o_prev_phrases):
                if y_label in banned_codes_for_classification:
                    continue
                tweets_X_tmp.append(tweet)
                tweets_X_prev_tmp.append(prev_tweet)
                tweets_y_tmp.append(y_label)
                o_phrases_tmp.append(o_phrase)
                o_prev_phrases_tmp.append(o_prev_phrase)
        else:
            for tweet, prev_tweet, party, y_label, o_phrase, o_prev_phrase in zip(
                    tweets_X, tweets_X_prev, tweets_X_party, tweets_y[0],
                    o_phrases, o_prev_phrases):
                if y_label in banned_codes_for_classification:
                    continue
                tweets_X_tmp.append(tweet)
                tweets_X_prev_tmp.append(prev_tweet)
                tweets_X_party_tmp.append(party)
                tweets_y_tmp.append(y_label)
                o_phrases_tmp.append(o_phrase)
                o_prev_phrases_tmp.append(o_prev_phrase)
    elif config['party']:
        for tweet, party, y_label, o_phrase in zip(tweets_X, tweets_X_party,
                                                   tweets_y[0], o_phrases):
            if y_label in banned_codes_for_classification:
                continue
            tweets_X_tmp.append(tweet)
            tweets_X_party_tmp.append(party)
            tweets_y_tmp.append(y_label)
            o_phrases_tmp.append(o_phrase)
    else:
        for tweet, y_label, o_phrase in zip(tweets_X, tweets_y[0], o_phrases):
            if y_label in banned_codes_for_classification:
                continue
            tweets_X_tmp.append(tweet)
            tweets_y_tmp.append(y_label)
            o_phrases_tmp.append(o_phrase)

    tweets_X = [tweets_X_tmp]
    if config['previous_phrase']:
        tweets_X.append(tweets_X_prev_tmp)
    if config['party']:
        tweets_X.append(tweets_X_party_tmp)

    tweets_y = [tweets_y_tmp]
    o_phrases = o_phrases_tmp
    if config['previous_phrase']:
        o_prev_phrases = o_prev_phrases_tmp
    tweets_train_X = generate_placeholder(len(tweets_X))
    tweets_train_y = generate_placeholder(len(tweets_y))
    tweets_eval_test_X = generate_placeholder(len(tweets_X))
    tweets_eval_test_y = generate_placeholder(len(tweets_y))
    if config['big_test']:
        sss_1 = StratifiedShuffleSplit(n_splits=1,
                                       test_size=0.5,
                                       random_state=config['seed'])
        train_indexes, eval_test_indexes = next(
            sss_1.split(tweets_X[0], tweets_y[0]))
    elif not config['architecture'] == 'multi_label':
        sss_1 = StratifiedShuffleSplit(n_splits=1,
                                       test_size=0.3,
                                       random_state=config['seed'])
        train_indexes, eval_test_indexes = next(
            sss_1.split(tweets_X[0], tweets_y[0]))
    elif config['architecture'] == 'multi_label':
        sss_1 = IterativeStratification(
            n_splits=2,
            order=2,
            sample_distribution_per_fold=[0.3, 0.7],
            random_state=config['seed'])
        train_indexes, eval_test_indexes = next(
            sss_1.split(np.array(tweets_X[0]), np.array(tweets_y[0])))
    #for train_indexes, eval_test_indexes in sss_1.split(tweets_X[0], tweets_y[0]):
    np.savetxt(folder + '/statistics/train_indexes' + config['class'] + '.out',
               train_indexes,
               delimiter=',')
    np.savetxt(folder + '/statistics/eval_test_indexes' + config['class'] +
               '.out',
               eval_test_indexes,
               delimiter=',')
    for train_index in train_indexes:
        o_train_phrases.append(o_phrases[train_index])
        if config['previous_phrase']:
            o_prev_train_phrases.append(o_prev_phrases[train_index])
        for i in range(len(tweets_X)):
            tweets_train_X[i].append(tweets_X[i][train_index])
        for i in range(len(tweets_y)):
            tweets_train_y[i].append(tweets_y[i][train_index])
    for eval_test_index in eval_test_indexes:
        o_eval_test_phrases.append(o_phrases[eval_test_index])
        if config['previous_phrase']:
            o_prev_eval_test_phrases.append(o_prev_phrases[eval_test_index])
        for i in range(len(tweets_X)):
            tweets_eval_test_X[i].append(tweets_X[i][eval_test_index])
        for i in range(len(tweets_y)):
            tweets_eval_test_y[i].append(tweets_y[i][eval_test_index])
    print "Number of train indexes: " + str(len(train_indexes))
    print "Number of eval_test indexes: " + str(len(eval_test_indexes))
    print "Number of unique indexes after the stratifying", str(
        len(
            np.unique(
                np.concatenate((train_indexes, eval_test_indexes), axis=0))))
    tweets_eval_X = generate_placeholder(len(tweets_X))
    tweets_eval_y = generate_placeholder(len(tweets_y))
    tweets_test_X = generate_placeholder(len(tweets_X))
    tweets_test_y = generate_placeholder(len(tweets_y))
    """Split the 30% of the previous splitting into 50-50 (15-15) evaluation-test"""
    if config['architecture'] == 'multi_label':
        sss_2 = IterativeStratification(
            n_splits=2,
            order=2,
            sample_distribution_per_fold=[0.5, 0.5],
            random_state=config['seed'])
        eval_indexes, test_indexes = next(
            sss_2.split(np.array(tweets_eval_test_X[0]),
                        np.array(tweets_eval_test_y[0])))
    else:
        sss_2 = StratifiedShuffleSplit(n_splits=1,
                                       test_size=0.5,
                                       random_state=config['seed'])
        eval_indexes, test_indexes = next(
            sss_2.split(tweets_eval_test_X[0], tweets_eval_test_y[0]))
    #for eval_indexes, test_indexes in sss_2.split(tweets_eval_test_X[0], tweets_eval_test_y[0]):
    np.savetxt(folder + '/statistics/eval_indexes' + config['class'] + '.out',
               eval_indexes,
               delimiter=',')
    np.savetxt(folder + '/statistics/test_indexes' + config['class'] + '.out',
               test_indexes,
               delimiter=',')
    for eval_index in eval_indexes:
        for i in range(len(tweets_X)):
            tweets_eval_X[i].append(tweets_eval_test_X[i][eval_index])
        for i in range(len(tweets_y)):
            tweets_eval_y[i].append(tweets_eval_test_y[i][eval_index])
    for test_index in test_indexes:
        o_test_phrases.append(o_eval_test_phrases[test_index])
        if config['previous_phrase']:
            o_prev_test_phrases.append(o_prev_eval_test_phrases[test_index])
        for i in range(len(tweets_X)):
            tweets_test_X[i].append(tweets_eval_test_X[i][test_index])
        for i in range(len(tweets_y)):
            tweets_test_y[i].append(tweets_eval_test_y[i][test_index])
    #tweets_y = to_one_hot_encoding(tweets_y, get_classes_from_target_class(config['class']))
    if config['architecture'] != 'multi_label':
        tweets_train_y[0] = to_one_hot_encoding(
            tweets_train_y[0], get_classes_from_target_class(config['class']))
        tweets_eval_y[0] = to_one_hot_encoding(
            tweets_eval_y[0], get_classes_from_target_class(config['class']))
        tweets_test_y[0] = to_one_hot_encoding(
            tweets_test_y[0], get_classes_from_target_class(config['class']))
    print '------------------ TRAIN TWEETS SAMPLE -------------------------------'
    print tweets_train_X[0][0]
    print tweets_train_X[0][1]
    print tweets_train_X[0][2]
    print '------------------ EVAL TWEETS SAMPLE -------------------------------'
    print tweets_eval_X[0][0]
    print tweets_eval_X[0][1]
    print tweets_eval_X[0][2]
    print '------------------ TEST TWEETS SAMPLE -------------------------------'
    print o_test_phrases[0]
    print o_test_phrases[1]
    print o_test_phrases[2]
    if config['big_test']:
        return tweets_eval_X, tweets_eval_y, tweets_test_X, tweets_test_y, tweets_train_X, tweets_train_y
    else:
        return tweets_train_X, tweets_train_y, tweets_eval_X, tweets_eval_y, tweets_test_X, tweets_test_y
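
# `generate_placeholder` is not shown; presumably it just allocates n empty lists:
def generate_placeholder(n):
    return [[] for _ in range(n)]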
class StackedPPB2(BaseEstimator, ClassifierMixin):
    """Stacked PPB2 model"""
    def __init__(self,
                 models=["morg2-nn+nb", "morg3-nn+nb"],
                 n_splits=5,
                 stack_method="predict_proba",
                 final_estimator=LogisticRegression(max_iter=1000),
                 n_proc=8,
                 passthrough=False):

        self.classifiers = [(model, PPB2(model=model, n_proc=n_proc))
                            for model in models]
        assert len(self.classifiers) == len(models)

        print(
            "building stacked PPB2 classifier",
            "using the following models:",
        )
        for model_name, classifier in self.classifiers:
            print(model_name, classifier)
        print()

        self.n_splits = n_splits
        assert stack_method in {"predict_proba", "predict"}
        self.stack_method = stack_method
        self.final_estimator = final_estimator
        self.n_proc = n_proc
        self.passthrough = passthrough
        if passthrough:
            raise NotImplementedError

    def fit(self, X, y):
        """
       
        """
        assert isinstance(X, pd.Series)

        assert y.any(axis=0).all(), "At least one positive example is needed"
        assert (1 -
                y).any(axis=0).all(), "At least one negative example is needed"

        print("Fitting meta-estimators using cross-validation")

        if len(y.shape) == 1:
            print("fitting in the single-target setting")
            self.multi_label = False
            self.split = StratifiedKFold(n_splits=self.n_splits)

            meta_preds = np.empty((
                X.shape[0],
                len(self.classifiers),
            ))

        else:
            print("fitting in the multi-target setting")
            print("number of targets:", y.shape[1])
            self.multi_label = True
            self.split = IterativeStratification(n_splits=self.n_splits,
                                                 order=1)
            self.n_targets = y.shape[1]

            meta_preds = np.empty((
                X.shape[0],
                len(self.classifiers),
                self.n_targets,
            ))

        for i, (name, classifier) in enumerate(self.classifiers):
            print("fitting classifier:", name)
            for split_no, (train, test) in enumerate(self.split.split(X, y)):
                print("processing split", split_no + 1, "/", self.n_splits)
                classifier.fit(X[train], y[train])
                if self.stack_method == "predict_proba":
                    meta_preds[test, i] = classifier.predict_proba(
                        X[test]
                    )  # multi target predict probs (for positive class)
                else:
                    meta_preds[test, i] = classifier.predict(
                        X[test])  # multi target predict
                print("completed split", split_no + 1, "/", self.n_splits)
                print()
            print(
                "completed classifier",
                name,
            )
            print()

        if not isinstance(y, np.ndarray):
            y = y.A

        if self.multi_label:
            print("fitting meta estimators")
            if not isinstance(self.final_estimator, list):
                self.final_estimator = [
                    clone(self.final_estimator) for _ in range(self.n_targets)
                ]

            for target_id in range(self.n_targets):
                with parallel_backend('threading', n_jobs=self.n_proc):
                    self.final_estimator[target_id].fit(
                        meta_preds[..., target_id], y[:, target_id])

                print("completed fitting meta estimator for target",
                      target_id + 1, "/", self.n_targets, "targets")
        else:

            print("fitting meta estimator")
            self.final_estimator.fit(meta_preds, y)

        print("completed fitting of meta estimator(s)")
        print()

        print("fitting base estimator(s) using full training set")
        for i, (name, classifier) in enumerate(self.classifiers):
            print("fitting classifier", name)
            classifier.fit(X, y)
            print(
                "completed classifier",
                name,
            )
            print()

        print()

        return self

    def predict(self, X):
        assert isinstance(X, pd.Series)
        # assert (X.dtype==pd.StringDtype()), "X should be a vector of smiles"

        if self.multi_label:
            meta_preds = np.empty((
                X.shape[0],
                len(self.classifiers),
                self.n_targets,
            ))
        else:
            meta_preds = np.empty((
                X.shape[0],
                len(self.classifiers),
            ))

        for i, (name, classifier) in enumerate(self.classifiers):
            print("performing prediction with classifier:", name)
            assert classifier.check_is_fitted()
            if self.stack_method == "predict_proba":
                meta_preds[:, i] = classifier.predict_proba(X)
            else:
                meta_preds[:, i] = classifier.predict(X)

        # final estimator
        if self.multi_label:
            final_pred = np.empty((X.shape[0], self.n_targets))
            for target_id in range(self.n_targets):
                check_is_fitted(self.final_estimator[target_id])
                with parallel_backend('threading', n_jobs=self.n_proc):
                    final_pred[:, target_id] = self.final_estimator[
                        target_id].predict(meta_preds[..., target_id])
            return final_pred
        else:
            check_is_fitted(self.final_estimator)
            return self.final_estimator.predict(meta_preds)

    def predict_proba(self, X):
        assert isinstance(X, pd.Series)
        # assert (X.dtype==pd.StringDtype()), "X should be a vector of smiles"

        if self.multi_label:
            meta_preds = np.empty((
                X.shape[0],
                len(self.classifiers),
                self.n_targets,
            ))
        else:
            meta_preds = np.empty((
                X.shape[0],
                len(self.classifiers),
            ))

        for i, (name, classifier) in enumerate(self.classifiers):
            assert classifier.check_is_fitted()
            if self.stack_method == "predict_proba":
                meta_preds[:, i] = classifier.predict_proba(X)
            else:
                meta_preds[:, i] = classifier.predict(X)

        # final estimator
        if self.multi_label:
            final_pred = np.empty((X.shape[0], self.n_targets))
            for target_id in range(self.n_targets):
                check_is_fitted(self.final_estimator[target_id])
                assert self.final_estimator[target_id].classes_[1]
                with parallel_backend('threading', n_jobs=self.n_proc):
                    final_pred[:, target_id] = self.final_estimator[
                        target_id].predict_proba(meta_preds[..., target_id])[:,
                                                                             1]
            return final_pred
        else:
            check_is_fitted(self.final_estimator)
            assert self.final_estimator.classes_[1]
            with parallel_backend('threading', n_jobs=self.n_proc):
                return self.final_estimator.predict_proba(meta_preds)[:, 1]

    def set_n_proc(self, n_proc):
        self.n_proc = n_proc
        for _, classifier in self.classifiers:
            classifier.set_n_proc(n_proc)
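
# Hypothetical usage (`smiles` is assumed to be a pd.Series of SMILES strings and
# `Y` a binary label matrix; the model names are the constructor defaults):
# model = StackedPPB2(models=["morg2-nn+nb", "morg3-nn+nb"], n_splits=5)
# model.fit(smiles, Y)                  # out-of-fold meta-features, then meta-estimators
# probs = model.predict_proba(smiles)   # (n_samples, n_targets) in the multi-label case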
Example #14
tstdf = tstdf[['Image', 'Label']]
tstdf.drop_duplicates(inplace=True)
tstdf.to_csv(os.path.join(path_data, 'test.csv.gz'),
             index=False,
             compression='gzip')

# Some small EDA
trndf.columns
trndf.iloc[:, 1:].hist(figsize=(10, 10))
trndf['Image'].drop_duplicates().shape[0] == trndf['Image'].shape[0]
trndf.shape
tstdf.shape
trndf.head()
tstdf.head()

# http://scikit.ml/api/skmultilearn.model_selection.iterative_stratification.html
k_fold = IterativeStratification(n_splits=4, order=1, random_state=100)
splits = k_fold.split(trndf[['Image']], trndf.iloc[:, 1:])
folds = [trndf['Image'].iloc[x].tolist() for (x, y) in splits]
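
# Each image appears in the training indices of all but one split, so marking the
# images absent from fold t's training list assigns every image to exactly one fold.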

trndf['fold'] = 0
for t, f in enumerate(folds):
    trndf.loc[~trndf.Image.isin(f), 'fold'] = t
trndf.groupby('fold')[trndf.columns.tolist()[1:-1]].sum()

# Write out the training file
trndf.shape
trndf.to_csv(os.path.join(path_data, 'train.csv.gz'),
             index=False,
             compression='gzip')