Example no. 1
def multi_label_split(
    X: np.ndarray, y: np.ndarray
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray,
           np.ndarray]:
    X_train, y_train, X_test, y_test = iterative_train_test_split(X,
                                                                  y,
                                                                  test_size=1 -
                                                                  TRAIN_SIZE)
    X_valid, y_valid, X_test, y_test = iterative_train_test_split(
        X_test, y_test, test_size=TEST_SIZE / (TEST_SIZE + VALID_SIZE))
    return X_train, y_train, X_valid, y_valid, X_test, y_test
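A minimal usage sketch for the helper above (illustrative only: TRAIN_SIZE, VALID_SIZE and TEST_SIZE are assumed module-level constants that the snippet does not show, and the module's own imports of numpy, typing.Tuple and skmultilearn are taken as given). The first call holds out everything beyond the training share, and the second call's test_size = TEST_SIZE / (TEST_SIZE + VALID_SIZE) divides that remainder, so 0.7/0.15/0.15 ends up as roughly a 70/15/15 split:

import numpy as np
from skmultilearn.model_selection import iterative_train_test_split

# Assumed constants referenced by multi_label_split (not shown in the snippet above).
TRAIN_SIZE, VALID_SIZE, TEST_SIZE = 0.7, 0.15, 0.15

# Toy multi-label data: 40 samples, 3 features, 4 binary labels.
rng = np.random.RandomState(0)
X = rng.rand(40, 3)
y = (rng.rand(40, 4) > 0.5).astype(int)

X_tr, y_tr, X_va, y_va, X_te, y_te = multi_label_split(X, y)
# The second split takes 0.15 / (0.15 + 0.15) = 0.5 of the held-out 30%,
# leaving ~70% train, ~15% validation, ~15% test.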
Example no. 2
def stratify_split_train_val_test(sids, labels):
    '''skmultilearn needs X as n_samples x n_dim inputs'''
    np.random.seed(286501567)
    inputs = np.expand_dims(sids, axis=-1)
    from skmultilearn.model_selection import iterative_train_test_split
    X, y, X_test, y_test = iterative_train_test_split(inputs,
                                                      labels,
                                                      test_size=0.5)
    X_train, y_train, X_val, y_val = iterative_train_test_split(X,
                                                                y,
                                                                test_size=0.2)
    return X_train.squeeze(), X_val.squeeze(), X_test.squeeze()
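The docstring's point, that skmultilearn wants X shaped n_samples x n_dim even when the "features" are just sample ids, is what the expand_dims / squeeze pair is doing. A small self-contained sketch of that pattern on toy data (names and values are illustrative, not from the snippet):

import numpy as np
from skmultilearn.model_selection import iterative_train_test_split

sids = np.arange(10)                                  # 1-D array of sample ids
labels = (np.random.RandomState(0).rand(10, 3) > 0.5).astype(int)

X = np.expand_dims(sids, axis=-1)                     # (10,) -> (10, 1)
X_tr, y_tr, X_te, y_te = iterative_train_test_split(X, labels, test_size=0.5)
train_ids, test_ids = X_tr.squeeze(), X_te.squeeze()  # back to 1-D id arrays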
Example no. 3
def stratifyval():
    labels = [
        'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
        'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia',
        'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
        'Fracture', 'Support Devices'
    ]

    totallabels = [
        'Path', 'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
        'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia',
        'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
        'Fracture', 'Support Devices'
    ]

    df1 = pd.read_csv('train_tmp.csv')
    df2 = pd.read_csv('test_tmp.csv')
    df3 = pd.read_csv('val_tmp.csv')
    df = pd.concat([df1, df2, df3])

    totalX = df["Path"].values
    totalY = df[labels].values
    totalX = np.expand_dims(totalX, axis=1)

    print("PRE ITERATIVE")
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        totalX, totalY, test_size=0.2)

    X_train, y_train, X_val, y_val = iterative_train_test_split(X_train,
                                                                y_train,
                                                                test_size=0.2)

    print("WRITING Train")
    dfTotal2 = pd.DataFrame(columns=totallabels)
    dfTotal2['Path'] = X_test.flatten()
    dfTotal2[labels] = y_test
    dfTotal2.to_csv("test_tmp2.csv")

    print("WRITING Train")
    dfTotal3 = pd.DataFrame(columns=totallabels)
    dfTotal3['Path'] = X_val.flatten()
    dfTotal3[labels] = y_val
    dfTotal3.to_csv("val_tmp2.csv")

    print("WRITING Test")
    dfTotal4 = pd.DataFrame(columns=totallabels)
    dfTotal4['Path'] = X_train.flatten()
    dfTotal4[labels] = y_train
    dfTotal4.to_csv("train_tmp2.csv")
Example no. 4
    def setup_wiki(self):

        mat = sio.loadmat(self.root / 'wiki' / 'POS.mat')

        self.metric = 'MicroF1'
        self.num_nodes = 4777
        self.num_classes = 40

        adj_t = mat['network'].tocoo()
        self.adj_t = SparseTensor(row=torch.LongTensor(adj_t.row),
                                  col=torch.LongTensor(adj_t.col),
                                  sparse_sizes=(self.num_nodes,
                                                self.num_nodes))

        if self.make_edge_index:
            row = self.adj_t.storage.row()
            col = self.adj_t.storage.col()
            self.edge_index = torch.stack((row, col), dim=0)

        self.y = torch.from_numpy(mat['group'].todense()).float()
        idx = torch.arange(self.y.shape[0]).view(-1, 1)
        train_idx, _, test_idx, _ = iterative_train_test_split(idx,
                                                               self.y,
                                                               test_size=0.1)
        self.split_idx = {
            'train': train_idx.view(-1),
            'valid': test_idx.view(-1),
            'test': test_idx.view(-1)
        }

        self.criterion = torch.nn.BCEWithLogitsLoss()  # for multi-label classification
Example no. 5
def main():

    start = time.time()

    input_filename = "../data/frwiki_discussions_categories_processed.csv/part-00000-381f0f76-28b9-4da9-8cb0-96958b5ea46e-c000.csv"
    df = load_data(input_filename)
    print("Load took:", (time.time() - start))

    df = preprocess_data(df)

    encoder = MultiLabelBinarizer()
    labels_df = pd.DataFrame(encoder.fit_transform(df.categories.values))

    X_train, y_train, X_test, y_test = iterative_train_test_split(
        df.text.values.reshape(-1, 1), labels_df.values, test_size=0.5)

    train_dataset = make_dataset(X_train, y_train)
    test_dataset = make_dataset(X_test, y_test)

    model = compile_model(verbose=True)
    model, history = train(model, train_dataset, test_dataset, lr=1e-3)

    output_dir = "../tf_model/frwikipedia_10_categories_classifier"

    model.save(output_dir)

    end = time.time()
    print("Complete training took :", (end - start))
Example no. 6
def run_test3(normas, n_jobs=1):
    # Corpus and labels:
    corpus = [norma['TextoPreProcessado'] for norma in normas]
    labels = [norma['AssuntoGeral'] for norma in normas]

    # Get X and y:
    X = corpus
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(labels)

    # Shuffle:
    X, y = shuffle(X, y, random_state=42)

    # Vectorizer
    X = TfidfVectorizer(min_df=20, max_df=0.5).fit_transform(X).toarray()

    # TrainTestSplit:
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        X, y, test_size=0.5)

    # Classifier:
    clf = MLPClassifier(hidden_layer_sizes=(150,), activation='relu')
    clf.fit(X_train, y_train)

    # Predict
    y_pred = clf.predict(X_test)
Example no. 7
def split_data(inkml_data):
    print("split data")
    uis_to_symbols = create_ui_symbol_map(inkml_data)
    symbol_index_map, size = create_symbol_index_map(uis_to_symbols)
    print("size:", size)
    array = np.zeros(size)
    uis = []
    for ui in uis_to_symbols:
        print(ui)
        indices = np.zeros(size)
        uis.append(ui)
        for symbol in uis_to_symbols[ui]:
            indices[symbol_index_map[symbol]] = uis_to_symbols[ui][symbol]
        array = np.vstack((array, indices))
    array = np.delete(array, (0), axis=0)
    le = pp.LabelEncoder()
    encoded_uis = le.fit_transform(uis)
    x1 = np.zeros(len(encoded_uis))
    print(len(encoded_uis))
    x = np.column_stack((np.array(encoded_uis), x1))
    print(array.shape)
    x_train, y_train, x_test, y_test = iterative_train_test_split(x, array, test_size=(1 / 3))

    x_train = x_train[:, 0].astype(int)
    x_test = x_test[:, 0].astype(int)

    training_ids = le.inverse_transform(x_train)
    testing_ids = le.inverse_transform(x_test)
    print(type(training_ids), type(testing_ids))
    return set(training_ids.tolist()), set(testing_ids.tolist())
Example no. 8
def ECC_test_2_fold(data, label, random_state=3071980, ensemble=5):
    # data set information
    n_label = label.shape[1]

    # split training and test data set
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        np.matrix(data), np.matrix(label), test_size=0.5)

    X_train = pd.DataFrame(X_train, columns=data.columns)
    X_test = pd.DataFrame(X_test, columns=data.columns)

    y_train = pd.DataFrame(y_train, columns=label.columns)
    y_test = pd.DataFrame(y_test, columns=label.columns)

    performance_df_all = pd.DataFrame()

    for j in range(2):
        X_test, X_train = X_train, X_test
        y_test, y_train = y_train, y_test

        # ensemble
        y_pred_ensemble = pd.DataFrame(np.zeros(y_test.shape),
                                       columns=y_test.columns,
                                       index=y_test.index)
        y_prob_ensemble = pd.DataFrame(np.zeros(y_test.shape),
                                       columns=y_test.columns,
                                       index=y_test.index)
        for i in range(ensemble):
            # training
            # print("--- start training ---\n")
            classifier_list, training_time, order = naiveBayes_multi_label_training(
                X_train, y_train)

            # testing
            # print("--- start testing ---\n")
            y_predict, y_prob, testing_time = naiveBayes_multi_label_testing(
                X_test, n_label, classifier_list, order)

            y_predict.columns = label.columns[order]
            y_prob.columns = label.columns[order]
            y_predict = y_predict[label.columns]
            y_prob = y_prob[label.columns]

            y_pred_ensemble = y_pred_ensemble + y_predict
            y_prob_ensemble = y_prob_ensemble + y_prob

        y_pred_ensemble = (((y_pred_ensemble / ensemble) >= 0.5) *
                           1).astype('int')
        y_prob_ensemble = y_prob_ensemble / ensemble

        # evaluation
        performance = evaluation(y_pred_ensemble, y_prob_ensemble, y_test)
        performance_df = pd.DataFrame.from_dict(performance, orient='index')

        #performance_df_all.index = performance_df.index

        performance_df_all = pd.concat([performance_df_all, performance_df],
                                       axis=1)

    return performance_df_all
Example no. 9
def BR_test(data, label, dataPath, random_state=3071980):
    # data set information
    n_label = label.shape[1]
    n_attr = data.shape[1]
    n_instance = data.shape[0]
    avg_label_per_instance = label.sum(axis=1).mean()

    # split training and test data set
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        np.matrix(data), np.matrix(label), test_size=0.5)

    X_train = pd.DataFrame(X_train, columns=data.columns)
    X_test = pd.DataFrame(X_test, columns=data.columns)

    y_train = pd.DataFrame(y_train, columns=label.columns)
    y_test = pd.DataFrame(y_test, columns=label.columns)

    # training
    classifier_list, training_time = naiveBayes_multi_label_training_BR(
        X_train, y_train)

    # testing
    y_predict, y_prob, testing_time = naiveBayes_multi_label_testing_BR(
        X_test, n_label, classifier_list)

    y_predict.columns = label.columns
    return y_predict, y_test
Example no. 10
def main():
    if len(sys.argv) == 3:
        database_filepath, model_filepath = sys.argv[1:]
        print('Loading data...\n    DATABASE: {}'.format(database_filepath))
        X, Y, category_names = load_data(database_filepath)
        # This part of the script is modified from the original in order to use
        # the iterative_train_test_split function from skmultilearn
        X = X.values
        X = X[:, np.newaxis]

        X_train, Y_train, X_test, Y_test = iterative_train_test_split(
            X, np.array(Y), test_size=0.3)

        X_train = np.squeeze(X_train)
        X_test = np.squeeze(X_test)

        print('Building model...')
        model = build_model()

        print('Training model...')
        model.fit(X_train, Y_train)

        print('Evaluating model...')
        evaluate_model(model, X_test, Y_test, category_names)

        print('Saving model...\n    MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)

        print('Trained model saved!')

    else:
        print('Please provide the filepath of the disaster messages database '\
              'as the first argument and the filepath of the pickle file to '\
              'save the model to as the second argument. \n\nExample: python '\
              'train_classifier.py ../data/DisasterResponse.db classifier.pkl')
Example no. 11
def train_and_predict(X, y, train_ratio=0.2, n_trials=10, random_state=None):
    micro, macro, c, std, f1, f1_std = [], [], [], [], [], []
    for i in range(n_trials):
        np.random.seed(random_state)
        X_train, y_train, X_test, y_test = iterative_train_test_split(
            X, y, test_size=1 - train_ratio)
        clf = MultiOutputClassifier(
            LogisticRegressionCV(max_iter=1e4, class_weight='balanced'))
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            clf.fit(X_train, y_train.A)
            y_pred = np.array(clf.predict_proba(X_test))[:, :, 1].T
            mi = roc_auc_score(y_test.A, y_pred, average="micro")
            ma = roc_auc_score(y_test.A, y_pred, average="macro")
            y_pred = clf.predict(X_test)
            f = f1_score(y_test.A, y_pred, average="micro")
        std.append(mi)
        f1.append(f)
        f1_std.append(f)
        micro.append(mi)
        macro.append(ma)
        c.append(
            np.mean([estimator.C_.mean() for estimator in clf.estimators_]))
    return np.mean(micro), np.mean(macro), np.mean(c), np.std(std), np.mean(
        f1), np.std(f1_std)
Example no. 12
def load_and_split(data, dev_size):
    """Loads a data file and uses stratified sampling to create splits."""
    samples, ids, tags = dict(), [], []
    with open(data, "r") as f:
        for line in f:
            sample = json.loads(line.strip())
            samples[sample["id"]] = sample
            ids.append(sample["id"])
            tags.append(sample["tags"])

    tag2id = utils.get_tag_mappings(tags)
    id2tag = {v: k for k, v in tag2id.items()}

    tags = np.array([utils.tags_to_onehot(tag, tag2id) for tag in tags])
    ids = np.array(ids).reshape(-1, 1)

    train_ids, _, dev_ids, _ = iterative_train_test_split(ids, tags, dev_size)

    with open("train.tmp", "w") as f:
        for id in train_ids:
            f.write(f"{json.dumps(samples[id.item()], ensure_ascii=False)}\n")

    with open("dev.tmp", "w") as f:
        for id in dev_ids:
            f.write(f"{json.dumps(samples[id.item()], ensure_ascii=False)}\n")

    dataset = load_dataset('json',
                           data_files={
                               "train": "train.tmp",
                               "dev": "dev.tmp"
                           })

    return dataset, tag2id, id2tag
Example no. 13
def two_fold(methods,
             data,
             label,
             dataset,
             ensemble=1,
             ordering="random",
             structure="bayes_net",
             lead=False):
    # setup
    savePath = "../code/temp/" + methods.__name__ + "/" + dataset + "/"
    if not os.path.exists(savePath):
        os.makedirs(savePath)

    print("running", methods.__name__)
    print("setting:", ensemble, ordering, structure, lead)
    performance_df_all = pd.DataFrame()
    if label.shape[1] >= 100:
        time = 20
    else:
        time = 20
    for j in range(time):
        print("time:", j)
        X_train, y_train, X_test, y_test = iterative_train_test_split(
            np.matrix(data), np.matrix(label), test_size=0.5)
        X_train = pd.DataFrame(X_train, columns=data.columns)
        X_test = pd.DataFrame(X_test, columns=data.columns)
        y_train = pd.DataFrame(y_train, columns=label.columns)
        y_test = pd.DataFrame(y_test, columns=label.columns)
        """
        X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.5)
        X_train.reset_index(inplace=True, drop=True)
        X_test.reset_index(inplace=True, drop=True)
        y_train.reset_index(inplace=True, drop=True)
        y_test.reset_index(inplace=True, drop=True)"""
        for i in range(2):
            X_test, X_train = X_train, X_test
            y_test, y_train = y_train, y_test

            # test
            if methods.__name__ == "BayesianClassifierChain_NB":
                pred_ensemble, prob_ensemble = BayesianClassifierChain_NB(
                    X_train, X_test, y_train, y_test, savePath, ensemble,
                    ordering, structure, lead)
            elif methods.__name__ == "ClassifierChain_NB":
                pred_ensemble, prob_ensemble = ClassifierChain_NB(
                    X_train, X_test, y_train, y_test, savePath, ensemble)

            else:
                raise BaseException("no such a function")

            performance = evaluation(pred_ensemble, prob_ensemble, y_test)
            performance_df = pd.DataFrame.from_dict(performance,
                                                    orient='index')
            performance_df_all = pd.concat(
                [performance_df_all, performance_df], axis=1)
    performance_df_all.columns = list(range(time * 2))
    return performance_df_all
Example no. 14
def multilearn_iterative_train_test_split(features, labels, cols, test_size=0.3):
    from skmultilearn.model_selection import iterative_train_test_split

    train_features, train_labels, test_features, test_labels = iterative_train_test_split(
        np.array(features), labels, test_size=test_size)
    # print(type(train_features))
    train_features = pd.DataFrame(train_features, columns=cols)
    test_features = pd.DataFrame(test_features, columns=cols)
    return train_features, test_features, train_labels, test_labels
Example no. 15
def split_train_test(df, train_size=70, test_size=30, folder='../data/action_db'):
    print('Splitting into train-test: ' + str(train_size) + '-' + str(test_size))
    def _convert_index_to_subj_emotion(y):
        print(y)
        y = [subjects[y[0]-1], EMOTIONS[y[1]]]
        return y

    if 'paco' not in folder:
        EMOTIONS = ['ang', 'fea', 'hap', 'neu', 'sad', 'unt']
        subjects = ['1m', '2f', '3m', '4f', '5m', '6f', '7f', '8m', '9f', '10f', '11f', '12m', '13f', '14f', '15m', '16f',
                    '17f', '18f', '19m', '20f', '21m', '22f', '23f', '24f', '25m', '26f', '27m', '28f', '29f']
        subjects = ['0'*(4-len(subject)) + subject for subject in subjects]
    else:
        # emotions = ['ang', 'fea', 'hap', 'sad', 'neu']
        EMOTIONS = ['ang', 'hap', 'sad', 'neu']
        subjects = ['ale', 'ali', 'alx', 'amc', 'bar', 'boo', 'chr', 'dav', 'din', 'dun', 'ele', 'emm', 'gra', 'ian', 'jan', 'jen', 'jua', 'kat', 'lin', 'mac', 'mar', 'mil', 'ndy', 'pet', 'rac', 'ros', 'she', 'shi', 'ste', 'vas']

    df = df.reindex(np.random.permutation(df.index))

    emotions_dict = {EMOTIONS[i]: int(i) for i in range(len(EMOTIONS))}
    subjects_dict = {subjects[i]: int(i + 1) for i in range(len(subjects))}
    y_classes = ['subject', 'emotion']
    y = df[y_classes]

    y['subject'] = y['subject'].map(subjects_dict)
    y['emotion'] = y['emotion'].map(emotions_dict)
    y = np.array(y)

    X = df.drop(y_classes, axis=1)
    columns = X.columns.values
    X = np.array(X)
    columns = np.append(columns, y_classes)

    assert len(X) == len(y)

    X_train, y_train, X_test, y_test = iterative_train_test_split(X, y, test_size=test_size/(test_size+train_size))
    y_train = np.apply_along_axis(_convert_index_to_subj_emotion, 1, y_train)
    y_test = np.apply_along_axis(_convert_index_to_subj_emotion, 1, y_test)

    train_df = pd.DataFrame(data=np.hstack((X_train, y_train)), columns=columns)
    test_df = pd.DataFrame(data=np.hstack((X_test, y_test)), columns=columns)

    print('Counts in train set')
    for emotion in EMOTIONS:
        print(emotion + ": " + str(len(train_df[train_df['emotion'] == emotion])))
    print('Counts in test set')
    for emotion in EMOTIONS:
        print(emotion + ": " + str(len(test_df[test_df['emotion'] == emotion])))

    print('Saving training data...')
    train_df.to_csv(folder + '/training/train_data.csv')
    # train_df.to_hdf(folder + '/training/train_data.h5', key='df', mode='w')
    print('Saving test data...')
    test_df.to_hdf(folder + '/test/test_data.h5', key='df', mode='w')
    print('Done.')
    return train_df, test_df
Example no. 16
def train_test_split(df, df_true, test_size=0.25):
    # iterative_train_test_split is only deterministic if we call this first
    np.random.seed(RANDOM_SEED)
    # iterative_train_test_split expects a matrix, whereas CountVectorizer
    # needs an iterable over strings
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        df.text.to_frame().values, df_true.values, test_size=test_size)
    X_train, X_test = X_train[:, 0], X_test[:, 0]

    return X_train, y_train, X_test, y_test
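The comment above makes a practical point: the split is only reproducible if NumPy's global RNG is seeded first, since the call itself is made without any seed argument. A minimal sketch of that behaviour, with an assumed RANDOM_SEED and toy data:

import numpy as np
from skmultilearn.model_selection import iterative_train_test_split

RANDOM_SEED = 42  # assumed value; the snippet above takes it from module scope

X = np.arange(20).reshape(-1, 1)
y = (np.random.RandomState(1).rand(20, 3) > 0.5).astype(int)

np.random.seed(RANDOM_SEED)
split_a = iterative_train_test_split(X, y, test_size=0.25)
np.random.seed(RANDOM_SEED)
split_b = iterative_train_test_split(X, y, test_size=0.25)

# Re-seeding before each call should make the two splits identical, which is
# the determinism the comment above relies on.
assert all(np.array_equal(a, b) for a, b in zip(split_a, split_b))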
Example no. 17
 def split_data(self):
     print(
         'Splitting data into training and validation set to train DNNs...')
     X_train, y_train, X_val, y_val = iterative_train_test_split(
         self._X_train, self._y_train, test_size=0.35)
     self._X_train = X_train
     self._y_train = y_train
     self._X_val = X_val
     self._y_val = y_val
     print('Train data:', X_train.shape)
     print('Train labels:', y_train.shape)
     print('Val data:', X_val.shape)
     print('Val labels:', y_val.shape)
Example no. 18
def stratify():
    labels = [
        'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
        'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia',
        'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
        'Fracture', 'Support Devices'
    ]

    totallabels = [
        'Path', 'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
        'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia',
        'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
        'Fracture', 'Support Devices'
    ]

    df = pd.read_csv('mimic_chex.csv')
    totalX = df["Path"].values
    totalY = df[labels].values
    totalX = np.expand_dims(totalX, axis=1)

    print("PRE ITERATIVE")
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        totalX, totalY, 0.2)

    print(X_train.shape)
    print("COMBINATION")
    df = pd.DataFrame({
        'train':
        Counter(
            str(combination)
            for row in get_combination_wise_output_matrix(y_train, order=2)
            for combination in row),
        'test':
        Counter(
            str(combination)
            for row in get_combination_wise_output_matrix(y_test, order=2)
            for combination in row)
    }).T.fillna(0.0)
    print(df.to_string())

    print("WRITING Train")
    dfTotal2 = pd.DataFrame(columns=totallabels)
    dfTotal2['Path'] = X_train.flatten()
    dfTotal2[labels] = y_train
    dfTotal2.to_csv("train_draft.csv")

    print("WRITING Test")
    dfTotal2 = pd.DataFrame(columns=totallabels)
    dfTotal2['Path'] = X_test.flatten()
    dfTotal2[labels] = y_test
    dfTotal2.to_csv("test.csv")
Example no. 19
def stratify():
    df = pd.read_csv(PATH_RADIO6, usecols=['TEXT', 'No Finding', 'Enlarged Cardiomediastinum',
                                           'Cardiomegaly', 'Airspace Opacity','Lung Lesion',
                                           'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis',
                                           'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
                                           'Fracture', 'Support Devices'], engine='python' )
    totalX = df["TEXT"].values
    totalY = df[['No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Airspace Opacity',
                 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis','Pneumothorax',
                 'Pleural Effusion', 'Pleural Other','Fracture', 'Support Devices']].values

    totalX = np.expand_dims(totalX, axis=1)

    print("PRE ITERATIVE")
    X_train, y_train, X_test, y_test = iterative_train_test_split(totalX, totalY, 0.2)


    print("COMBINATION")
    df = pd.DataFrame({
        'train': Counter(
            str(combination) for row in get_combination_wise_output_matrix(y_train, order=2)
            for
            combination in row),
        'test': Counter(
            str(combination) for row in get_combination_wise_output_matrix(y_test, order=2) for
            combination in row)
    }).T.fillna(0.0)
    print(df.to_string())

    print("WRITING Train")

    dfTotal2 = pd.DataFrame(columns=["TEXT", 'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Airspace Opacity',
                 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis','Pneumothorax',
                 'Pleural Effusion', 'Pleural Other','Fracture', 'Support Devices'])
    dfTotal2['TEXT'] = X_train.flatten()
    dfTotal2[['No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Airspace Opacity',
                 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis','Pneumothorax',
                 'Pleural Effusion', 'Pleural Other','Fracture', 'Support Devices']] = y_train
    dfTotal2.to_csv("train.csv")

    print("WRITING Test")

    dfTotal2 = pd.DataFrame(columns=["TEXT", 'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Airspace Opacity',
                 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis','Pneumothorax',
                 'Pleural Effusion', 'Pleural Other','Fracture', 'Support Devices'])
    dfTotal2['TEXT'] = X_test.flatten()
    dfTotal2[['No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Airspace Opacity',
                 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis','Pneumothorax',
                 'Pleural Effusion', 'Pleural Other','Fracture', 'Support Devices']] = y_test
    dfTotal2.to_csv("test.csv")
Example no. 20
def fragment_train_test_split(data: list, labels: list, test_size=0.2, shuffle=True):
    """Treating the data as multilabel to do iterative train_test_split

    arguments:
      data: list of objects from the `Fragment` dataclass
      labels: list of used labels
      test_size: part of the data to be used as the `test` set
      shuffle: if True, shuffle the dataset before splitting.
    """
    def label_to_int(l, labels):
        return labels.index(l)

    if shuffle:
        random.shuffle(data)

    # For train_test_split we use an X containing only the index of the data items
    X = [[index] for index in range(len(data))]
    y = []
    num_labels = len(labels)
    # One hot encoding of multiple categories
    for d in data:
        label_list = list({label_to_int(f.label, labels) for f in d['fragments']})
        y.append([1 if l in label_list else 0 for l in range(num_labels)])

    nX = np.array(X)
    ny = np.array(y)

    # We use iterative_train_test_split to split with an even division for
    # labels.
    #
    # NOTE!
    # multilearn returns for iterative_train_test_split are:
    #   X_train, y_train, X_test, y_test
    # unlike sklearn train_test_split which returns
    #   X_train, X_test, y_train, y_test
    X_train, _, X_test, _ = iterative_train_test_split(nX, ny, test_size=test_size)
    X_train = np.squeeze(X_train)
    X_test = np.squeeze(X_test)
    print("Train:", X_train.shape, "Test:", X_test.shape)

    train = []
    test = []
    for key in list(X_train):
        train.append(data[key])
    for key in list(X_test):
        test.append(data[key])

    return train, test
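The NOTE above is an easy thing to trip over when both libraries show up in the same codebase; a short sketch contrasting the two return orders on the same toy data (illustrative only):

import numpy as np
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split

X = np.arange(12).reshape(-1, 1)
y = (np.random.RandomState(0).rand(12, 3) > 0.5).astype(int)

# sklearn: X_train, X_test, y_train, y_test
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25)

# skmultilearn: X_train, y_train, X_test, y_test
X_tr, y_tr, X_te, y_te = iterative_train_test_split(X, y, test_size=0.25)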
Example no. 21
    def splitting_data(self):
        '''
        Shuffle the rows of self.data and split them into training and test
        sets with iterative stratification over the label columns.
        '''
        data_random = self.data.sample(frac=1)

        X = data_random[['guid', 'txt']].to_numpy()
        Y = data_random[[
            'Safety', 'CleanlinessView', 'Information', 'Service', 'Comfort',
            'PersonnelCard', 'Additional'
        ]].to_numpy()

        self.X_train, self.y_train, self.X_test, self.y_test = iterative_train_test_split(
            X, Y, test_size=self.test_size)
        return self.X_train, self.y_train, self.X_test, self.y_test
Example no. 22
def Iterative_Stratifier_Split(df, ratio=0.15):
    img_ids = df.image_id.unique()
    print("Creating one-hot labels ...")
    labels = np.zeros((len(img_ids), len(df.columns) - 1), dtype=np.uint8)
    for i, img_id in enumerate(tqdm(img_ids)):
        # for i, img_id in enumerate(img_ids):
        aa = df.loc[df.image_id == img_id, :].to_numpy()[0, 1:]
        # print(aa.shape)
        labels[i] = aa
    print("Done!")
    print("Spliting dataset ...")
    train_image_id, train_class_id, test_image_id, test_class_id = iterative_train_test_split(
        img_ids.reshape(-1, 1), labels, test_size=ratio)
    train_df = df[df.image_id.map(lambda x: x in train_image_id.reshape(-1))]
    test_df = df[df.image_id.map(lambda x: x in test_image_id.reshape(-1))]
    print("Done!\n")
    return train_df, test_df
Example no. 23
def test_stratified_split():
    y = np.array([[0, 1], [0, 3], [1, 3], [4, 5], [4, 3], [4, 4], [4, 4]])
    X = np.array([[i, i + 1] for i in range(len(y))])
    assert len(X) == len(y)
    classes, y_indices = np.unique(y, return_inverse=True)
    n_classes = classes.shape[0]

    class_counts = np.bincount(y_indices)
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        X, y, test_size=0.5)
    print('X_train')
    print(X_train)
    print('y_train')
    print(y_train)
    print('X_test')
    print(X_test)
    print('y_test')
    print(y_test)
Example no. 24
def multilayer_sample(edges: pd.DataFrame,
                      layer_ids: List[int],
                      hidden_ratio: float = 0.5,
                      random_state: Optional[int] = None) -> MultiLayerSplit:
    """
    Split multilayer network into hidden and observed parts
    for specified layers. First, split nodes at random, then
    split edges accordingly.
    
    Usage example:   
    
    >>> from fao_data import load_all_layers
    >>> edges = load_all_layers()
    >>> sample = multilayer_sample(edges, [42, 123], random_state=0)
    >>> sample.print_summary()
    Summary of random split. Layer ids: [42, 123]
               total  observed  hidden  obs.ratio
    nodes        136        67      69   0.492647
    edges       1034       248     786   0.239845
    layer 42     773       179     594   0.231565
    layer 123    261        69     192   0.264368
    """
    
    edges = filter_by_layer(edges, layer_ids)
    nodes = sorted(node_set(edges))
    node_layers = _node_layer_incidence(edges, nodes, layer_ids)
    np.random.seed(random_state)
    nodes_observed, _, nodes_hidden, _ = \
        iterative_train_test_split(np.array(nodes).reshape(-1, 1),
                                   node_layers,
                                   test_size=hidden_ratio)
    nodes_observed = nodes_observed.flatten().tolist()
    nodes_hidden = nodes_hidden.flatten().tolist()
    edges_observed, edges_hidden = partition_into_observed_and_hidden(edges, nodes_hidden)
    split = MultiLayerSplit(
        layer_ids=layer_ids,
        node_index=index_elements(nodes),
        observed=GraphData(edges_observed, nodes_observed),
        hidden=GraphData(edges_hidden, nodes_hidden),
        full=GraphData(edges, nodes)
    )
    return split
Example no. 25
def main():
    data_df = pd.read_csv('./data/train.csv')

    defects_df = []
    for i in range(0, len(data_df), 4):
        defi = {}
        defi['ImageId_ClassId'] = data_df.loc[i, 'ImageId_ClassId'][:-2]
        defi['1'] = int(not pd.isnull(data_df.loc[i, 'EncodedPixels']))
        defi['2'] = int(not pd.isnull(data_df.loc[i + 1, 'EncodedPixels']))
        defi['3'] = int(not pd.isnull(data_df.loc[i + 2, 'EncodedPixels']))
        defi['4'] = int(not pd.isnull(data_df.loc[i + 3, 'EncodedPixels']))
        defects_df.append(defi)
    defects_df = pd.DataFrame(defects_df)[[
        'ImageId_ClassId', '1', '2', '3', '4'
    ]]
    # defects_df.to_csv('./data/defect_types.csv', index=False)

    Xd = np.expand_dims(np.array(range(len(defects_df))), 1)
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        Xd, defects_df[['1', '2', '3', '4']].to_numpy(), test_size=0.2)

    print("Train set size = {}, Test set size = {}".format(
        len(X_train), len(X_test)))

    print(
        pd.DataFrame({
            'train':
            Counter(
                str(combination)
                for row in get_combination_wise_output_matrix(y_train, order=2)
                for combination in row),
            'test':
            Counter(
                str(combination)
                for row in get_combination_wise_output_matrix(y_test, order=2)
                for combination in row)
        }).T.fillna(0.0))

    defects_df['is_valid'] = False
    defects_df.loc[X_test[:, 0], 'is_valid'] = True
    defects_df.to_csv('./data/split.csv', index=False)
Example no. 26
 def train_test_split(self,meth="random",prob=0.8,nf=5):
     if(meth=="stratified"):  ##Use scikit-multilearn
         train, _, test, _ = iterative_train_test_split(np.arange(0,self.n).reshape(self.n,1), self.targets.values, test_size = 1-prob)
         self.idx_train=train[:,0]
         self.idx_test=test[:,0]
     elif(meth=="random"):
         s=np.random.choice(a=2,size=self.n,p=[prob,1-prob])
         self.idx_train=np.where(s==0)[0]
         self.idx_test=np.where(s==1)[0]
     elif(meth=="kfold"):
         indices=np.arange(self.n)
         np.random.shuffle(indices)
         foldsize=int(np.round(self.n/nf))
         for i in range(nf):
             self.kfolds.append({'fold':i+1,
                                 'idx_test':indices[np.arange(start=i*foldsize,stop=(i+1)*foldsize).tolist()],
                                 'idx_train':indices[np.arange(start=0,stop=i*foldsize).tolist()+np.arange(start=(i+1)*foldsize,stop=self.n).tolist()]
                     })
     elif(meth=="bootstrap"):
         indices=np.arange(self.n)
     else: ###do not split
         self.idx_train=np.arange(0,self.n)
         self.idx_test=np.arange(0,self.n)
Example no. 27
def split_train_cv(
        data_frame: pd.DataFrame,
        frac: float = 0.9,
        y=None,  # Only for stratified, computes necessary split
        **kwargs):
    """split_train_cv

    :param data_frame:
    :type data_frame: pd.DataFrame
    :param frac:
    :type frac: float
    """
    if kwargs.get('mode',
                  None) == 'urbansed':  # Filenames are DATA_-1 DATA_-2 etc
        data_frame.loc[:, 'id'] = data_frame.groupby(
            data_frame['filename'].str.split('_').apply(
                lambda x: '_'.join(x[:-1]))).ngroup()
        sampler = np.random.permutation(data_frame['id'].nunique())
        num_train = int(frac * len(sampler))
        train_indexes = sampler[:num_train]
        cv_indexes = sampler[num_train:]
        train_data = data_frame[data_frame['id'].isin(train_indexes)]
        cv_data = data_frame[data_frame['id'].isin(cv_indexes)]
        del train_data['id']
        del cv_data['id']
    elif kwargs.get('mode', None) == 'stratified':
        # Use stratified sampling
        from skmultilearn.model_selection import iterative_train_test_split
        index_train, _, index_cv, _ = iterative_train_test_split(
            data_frame.index.values.reshape(-1, 1), y, test_size=1. - frac)
        train_data = data_frame[data_frame.index.isin(index_train.squeeze())]
        cv_data = data_frame[data_frame.index.isin(index_cv.squeeze())]
    else:
        # Simply split train_test
        train_data = data_frame.sample(frac=frac, random_state=10)
        cv_data = data_frame[~data_frame.index.isin(train_data.index)]
    return train_data, cv_data
Example no. 28
    def run_exam(self):
        log_dir = os.path.join(
            self.config.tb_path,
            datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
        tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

        adapter_size = None  # use None to fine-tune all of BERT
        model = create_model(self.config, adapter_size=adapter_size)
        X_train, y_train, X_test, y_test = iterative_train_test_split(
            self.train_x, self.train_y, test_size=self.config.test_ratio)

        if self.config.load_model_name:
            model.load_weights(
                os.path.join(self.config.epoch_model_path,
                             self.config.load_model_name))

        model.fit(x=X_train,
                  y=y_train,
                  validation_data=(X_test, y_test),
                  batch_size=self.config.batch_size,
                  shuffle=True,
                  epochs=self.config.num_epochs,
                  initial_epoch=self.config.initial_epoch,
                  callbacks=[
                      create_learning_rate_scheduler(
                          max_learn_rate=1e-5,
                          end_learn_rate=1e-7,
                          warmup_epoch_count=self.config.warmup_epoch_count,
                          total_epoch_count=self.config.num_epochs),
                      tensorboard_callback,
                      MyCustomCallback(self.config)
                  ])
        model.save_weights(os.path.join(self.config.epoch_model_path,
                                        'sentiments.h5'),
                           overwrite=True)
        self.bot.send_msg('{} train is done'.format(self.config.train_name))
Example no. 29
def stratify_val():
    df = pd.read_csv(
        "physionet.org/files/mimic-cxr-jpg/2.0.0/train_multi2_v3.csv",
        usecols=[
            'Path_compr', 'Indication', 'Impression', 'Findings', 'No Finding',
            'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity',
            'Lung Lesion',
            'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis',
            'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture',
            'Support Devices'
        ])

    totalX = df[['Path_compr', 'Indication', 'Impression', 'Findings']].values
    totalY = df[[
        'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
        'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia',
        'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
        'Fracture', 'Support Devices'
    ]].values

    print(totalX.shape)
    print(totalY.shape)

    totalX = np.expand_dims(totalX, axis=1)

    print("PRE ITERATIVE")
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        totalX, totalY, 0.2)

    print("COMBINATION")
    df = pd.DataFrame({
        'train':
        Counter(
            str(combination)
            for row in get_combination_wise_output_matrix(y_train, order=2)
            for combination in row),
        'test':
        Counter(
            str(combination)
            for row in get_combination_wise_output_matrix(y_test, order=2)
            for combination in row)
    }).T.fillna(0.0)
    print(df.to_string())

    X_train = np.squeeze(X_train, axis=1)
    X_test = np.squeeze(X_test, axis=1)

    print(X_train.shape)
    print(y_train.shape)
    print(X_test.shape)
    print(y_test.shape)

    print("WRITING Train")

    dfTotal2 = pd.DataFrame(columns=[
        'Path_compr', 'Indication', 'Impression', 'Findings', 'No Finding',
        'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity',
        'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis',
        'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture',
        'Support Devices'
    ])
    print(dfTotal2.shape)
    dfTotal2[['Path_compr', 'Indication', 'Impression',
              'Findings']] = pd.DataFrame(X_train)
    dfTotal2[[
        'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
        'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia',
        'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
        'Fracture', 'Support Devices'
    ]] = y_train

    with open("physionet.org/files/mimic-cxr-jpg/2.0.0/train_multi_v3.csv",
              mode='w',
              newline='\n') as f:
        dfTotal2.to_csv(f,
                        sep=",",
                        float_format='%.2f',
                        index=False,
                        line_terminator='\n',
                        encoding='utf-8')

    print("WRITING Test")

    dfTotal2 = pd.DataFrame(columns=[
        'Path_compr', 'Indication', 'Impression', 'Findings', 'No Finding',
        'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity',
        'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis',
        'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture',
        'Support Devices'
    ])
    dfTotal2[['Path_compr', 'Indication', 'Impression',
              'Findings']] = pd.DataFrame(X_test)
    dfTotal2[[
        'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
        'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia',
        'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
        'Fracture', 'Support Devices'
    ]] = y_test
    with open("physionet.org/files/mimic-cxr-jpg/2.0.0/val_multi_v3.csv",
              mode='w',
              newline='\n') as f:
        dfTotal2.to_csv(f,
                        sep=",",
                        float_format='%.2f',
                        index=False,
                        line_terminator='\n',
                        encoding='utf-8')
Example no. 30
    def split(self):
        """Split a dataset into training and testing (or validation)"""
        import argparse, sys
        parser = self.new_parser('split')
        # prefixing the argument with -- means it's optional
        parser.add_argument('input',
                            type=str,
                            help='Path to input image HDF5 file')
        parser.add_argument('train_output',
                            type=str,
                            help='Path to output HDF5 file (training)')
        parser.add_argument('test_output',
                            type=str,
                            help='Path to output HDF5 file (testing)')
        parser.add_argument(
            '--h5keys',
            default='images,labels',
            help='Name of datasets in input and HDF5 files (comma-separated)')
        parser.add_argument(
            '--copy_other_keys',
            action='store_true',
            help='Copy all other keys from input file into output verbatim')
        parser.add_argument('--random_seed',
                            default=0,
                            type=int,
                            help='Random seed to use for determining split')
        parser.add_argument(
            '--test_size',
            default=0.25,
            help='Size of the test set. If <= 1, proportion of the dataset '
            'to use; otherwise, number of samples.')
        parser.add_argument('--stratify_key',
                            default=None,
                            help='Key to use for stratification labels')
        args = parser.parse_args(sys.argv[2:])

        keys = args.h5keys.split(',')

        test_size = float(args.test_size)
        if test_size > 1:  # if not a proportion, should be an integer
            test_size = int(args.test_size)

        dataset = H5Dataset(args.input, key=keys)

        stratify = None
        if args.stratify_key is not None:
            # load all the labels
            with h5py.File(args.input, 'r') as f:
                stratify = np.array(f[args.stratify_key])
            if len(stratify.shape) == 2:
                if stratify.shape[1] == 1:
                    stratify = stratify.squeeze(1)
            elif len(stratify.shape) > 2:
                raise Exception(
                    f"Dimension of dataset {args.stratify_key} cannot be more than two"
                )

        if stratify is None or len(stratify.shape) == 1:
            from sklearn.model_selection import train_test_split
            ix_train, ix_test = train_test_split(range(len(dataset)),
                                                 test_size=test_size,
                                                 random_state=args.random_seed,
                                                 stratify=stratify)
        else:
            from skmultilearn.model_selection import iterative_train_test_split
            # set random seeds manually
            import random
            random.seed(args.random_seed)
            np.random.seed(args.random_seed)
            ix_train, y_train, ix_test, y_test = iterative_train_test_split(
                np.arange(len(dataset), dtype=np.uint32).reshape(-1, 1),
                stratify,
                test_size=test_size)

        dstrain = SubsetDataset(dataset, ix_train)
        dstest = SubsetDataset(dataset, ix_test)

        write_dataset_h5(dstrain, args.train_output, key=keys)
        with h5py.File(args.train_output, 'a') as f:
            self._stamp_dataset(f[keys[0]], args)
        if args.copy_other_keys:
            self.copy_other_keys(args.input, args.train_output, keys)

        write_dataset_h5(dstest, args.test_output, key=keys)
        with h5py.File(args.test_output, 'a') as f:
            self._stamp_dataset(f[keys[0]], args)
        if args.copy_other_keys:
            self.copy_other_keys(args.input, args.test_output, keys)