import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer


def getEncoder(path: str) -> MultiLabelBinarizer:
    """Read the multi-label encoder used for the dataset from the given path.

    Args:
        path (str): Path to the persisted encoder classes

    Returns:
        MultiLabelBinarizer: Encoder
    """
    encoder = MultiLabelBinarizer()
    encoder.classes_ = np.load(path, allow_pickle=True)
    return encoder
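A minimal usage sketch (the file name label_classes.npy and the toy labels are placeholders): persist a fitted encoder's classes with np.save, then restore them via getEncoder. A restored encoder is safe for inverse_transform and for reading the label order, since fit() was never called on it.

mlb = MultiLabelBinarizer()
mlb.fit([['news', 'sports'], ['news']])
np.save('label_classes.npy', mlb.classes_)

encoder = getEncoder('label_classes.npy')
print(encoder.inverse_transform(np.array([[1, 0]])))  # [('news',)]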
Example #2
def load_dataset(data_setting, batch_size, split):
    data = pd.read_csv(f'{GENERATED_DIR}/{split}_{data_setting}.csv',
                       dtype={'LENGTH': int})
    len_stat = data['LENGTH'].describe()
    logging.info(f'{split} set length stats:\n{len_stat}')

    if data_setting == FULL:
        code_df = pd.read_csv(CODE_FREQ_PATH, dtype={'code': str})
        all_codes = ';'.join(map(str, code_df['code'].values.tolist()))
        # DataFrame.append was removed in pandas 2.0; build the sentinel row
        # and concatenate it instead. 'LENGTH' matches the column read above
        # (the original lowercase 'length' silently created a new column).
        sentinel = pd.DataFrame([{
            'HADM_ID': -1,
            'TEXT': 'remove',
            'LABELS': all_codes,
            'LENGTH': 6
        }])
        data = pd.concat([data, sentinel], ignore_index=True)

    mlb = MultiLabelBinarizer()
    data['LABELS'] = data['LABELS'].apply(lambda x: str(x).split(';'))
    code_counts = list(data['LABELS'].str.len())
    avg_code_counts = sum(code_counts) / len(code_counts)
    logging.info(
        f'In {split} set, average code counts per discharge summary: {avg_code_counts}'
    )
    mlb.fit(data['LABELS'])
    temp = mlb.transform(data['LABELS'])
    if mlb.classes_[-1] == 'nan':
        mlb.classes_ = mlb.classes_[:-1]
    logging.info(f'Final number of labels/codes: {len(mlb.classes_)}')

    for i, x in enumerate(mlb.classes_):
        data[x] = temp[:, i]
    data.drop(['LABELS', 'LENGTH'], axis=1, inplace=True)

    if data_setting == FULL:
        data = data[:-1]

    code_list = list(mlb.classes_)
    label_freq = list(data[code_list].sum(axis=0))
    hadm_ids = data['HADM_ID'].values.tolist()
    texts = data['TEXT'].values.tolist()
    labels = data[code_list].values.tolist()
    item_count = (len(texts) // batch_size) * batch_size
    logging.info(f'{split} set true item count: {item_count}\n\n')
    return {
        'hadm_ids': hadm_ids[:item_count],
        'texts': texts[:item_count],
        'targets': labels[:item_count],
        'labels': code_list,
        'label_freq': label_freq
    }
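A hypothetical invocation, assuming the module-level constants (GENERATED_DIR, FULL, CODE_FREQ_PATH) are configured and the generated CSVs exist:

train_set = load_dataset(data_setting=FULL, batch_size=16, split='train')
# Items are truncated to a multiple of batch_size, so len % 16 == 0 here.
print(len(train_set['texts']), len(train_set['labels']))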
Example #3
    def _set_player_ids(self, df):
        """Assign an index to each player api_id."""
        mlb_all_players = MultiLabelBinarizer()
        all_players = df[self.cols_home + self.cols_away].values
        mlb_all_players.fit(all_players)
        # Reserve -1 as the id for a missing player.
        if -1 not in mlb_all_players.classes_:
            mlb_all_players.classes_ = np.append(mlb_all_players.classes_, -1)
            assert mlb_all_players.classes_[-1] == -1

        self.players = mlb_all_players.classes_
        self.n_players = len(self.players)
        self.api_id2idx = {api: idx for idx, api in enumerate(self.players)}
        self.idx2api_id = {idx: api for idx, api in enumerate(self.players)}
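The two dictionaries boil down to enumerating the binarizer's sorted classes; a toy round trip with made-up ids:

players = [-1, 101, 205, 317]  # sorted unique api ids, -1 = missing player
api_id2idx = {api: idx for idx, api in enumerate(players)}
idx2api_id = {idx: api for idx, api in enumerate(players)}
assert idx2api_id[api_id2idx[205]] == 205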
Example #4
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import MultiLabelBinarizer


def evaluate(scores, labels, CLASSES):
    """Evaluate the predicted classes against the gold labels."""
    mlb = MultiLabelBinarizer()
    mlb.fit([CLASSES])
    # fit() sorts the classes alphabetically; restore the original order so
    # the indicator columns line up with the score matrix.
    mlb.classes_ = np.array(CLASSES)

    gold_label = mlb.transform(labels.tolist())
    # np.matrix is deprecated; a plain ndarray behaves identically here.
    pred_score = np.asarray(scores.tolist())
    pred_label = (pred_score > 0.5).astype(int)
    roc_auc = roc_auc_score(gold_label, pred_score, average="micro")
    f1 = f1_score(gold_label, pred_label, average="micro")
    return f1, roc_auc
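A toy invocation with made-up labels and scores; evaluate expects numpy inputs because it calls .tolist() on both arguments:

import numpy as np

CLASSES = ['A', 'B', 'C']
labels = np.array([['A'], ['B', 'C']], dtype=object)
scores = np.array([[0.9, 0.2, 0.1],
                   [0.1, 0.8, 0.7]])
f1, roc_auc = evaluate(scores, labels, CLASSES)
print(f1, roc_auc)  # 1.0 1.0 on this perfectly separated toy case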
Example #5
# The snippet starts mid-class; the class name (TextPreprocessor) and the
# __init__ signature below are assumed for completeness.
class TextPreprocessor(object):
    def __init__(self, vocab_size):
        self._vocab_size = vocab_size
        self._tokenizer = None

    def create_tokenizer(self, text_list):
        tokenizer = text.Tokenizer(num_words=self._vocab_size)
        tokenizer.fit_on_texts(text_list)
        self._tokenizer = tokenizer

    def transform_text(self, text_list):
        text_matrix = self._tokenizer.texts_to_matrix(text_list)
        return text_matrix


tag_encoder = MultiLabelBinarizer()

tag_encoder.classes_ = [
    'anger', 'fear', 'happiness', 'happy', 'love', 'neutral', 'relief',
    'sadness', 'surprise', 'worry'
]


def predict_emotion(text_requests):
    classifier = CustomModelPrediction.from_path('.')
    results = classifier.predict(text_requests)
    emotion = []
    # Keep every (score, label) pair above the 0.1 confidence threshold.
    for result in results:
        for idx, val in enumerate(result):
            if val > 0.1:
                emotion.append([val, tag_encoder.classes_[idx]])
    emotion.sort(reverse=True)
    # The snippet was truncated here; returning the sorted pairs is assumed.
    return emotion
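The loop above keeps any emotion scoring over 0.1 and sorts by confidence; the same logic on made-up scores:

scores = [0.05, 0.62, 0.31, 0.0, 0.12, 0.02, 0.0, 0.4, 0.01, 0.08]
top = sorted(([v, tag_encoder.classes_[i]]
              for i, v in enumerate(scores) if v > 0.1),
             reverse=True)
print(top)  # [[0.62, 'fear'], [0.4, 'sadness'], [0.31, 'happiness'], [0.12, 'love']]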
Example #6
                       # comment in flair embeddings for state-of-the-art results
                       # FlairEmbeddings('news-forward'),
                       # FlairEmbeddings('news-backward'),
                       ]

    # document_embeddings: DocumentRNNEmbeddings = DocumentMeanEmbeddings(word_embeddings)

    document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                hidden_size=32,
                                                reproject_words=True,
                                                reproject_words_dimension=word_embeddings[0].embedding_length,
                                                )


    label_encoder = MultiLabelBinarizer()
    # Order the classes by their index in the flair label dictionary.
    label_encoder.classes_ = np.array([
        label.decode('utf8')
        for label, _ in sorted(corpus.make_label_dictionary().item2idx.items(),
                               key=lambda x: x[1])
    ])

    def score_fun(train_data, test_data):
        clf = TextClassifierProba(document_embeddings,
                                  label_dictionary=label_dict,
                                  multi_label=False)
        trainer = ModelTrainer(clf, corpus, torch.optim.RMSprop)
        base_path = 'flair_resources/text_clf/20newsgroups'
        print('start training')
        trainer.train(base_path,
                      learning_rate=0.01,
                      mini_batch_size=32,
                      anneal_factor=0.5,
                      patience=5,
                      max_epochs=20)

        y_train = label_encoder.transform(get_targets(train_data))
        y_test = label_encoder.transform(get_targets(test_data))
Example #7
import pickle
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
import os

this_dir, this_filename = os.path.split(__file__)
DATA_PATH = os.path.join(this_dir, "data", "data.txt")

INPUT_ENCODER = MultiLabelBinarizer()
OUTPUT_ENCODER = MultiLabelBinarizer()

# Load the pickled models via context managers so the file handles get closed.
with open(os.path.join(this_dir, "model", "DateTimeNER.mdl"), "rb") as f:
    TAGGER_MODEL = pickle.load(f)
with open(os.path.join(this_dir, "model", "MULTI_TARGET_FOREST.mdl"), "rb") as f:
    DATES_MDL = pickle.load(f)

# Note: allow_pickle=True may be needed here if the class arrays were saved
# with object dtype (numpy >= 1.16.3 defaults to allow_pickle=False).
INPUT_ENCODER.classes_ = np.load(
    os.path.join(this_dir, "model", "INPUT_ENCODER.npy"))
OUTPUT_ENCODER.classes_ = np.load(
    os.path.join(this_dir, "model", "OUTPUT_ENCODER.npy"))


class TaggerModel(object):
    def __init__(self, features):
        self.features = features

    def predict(self):
        # Run the model once instead of twice (the original predicted twice).
        prediction = TAGGER_MODEL.predict(self.features)
        print(prediction)
        return prediction[0]


class DatesSettingsModel(object):
    def __init__(self, patterns=None):
        # Avoid the mutable default argument the original used; the rest of
        # the body is truncated in this snippet.
        self.patterns = patterns if patterns is not None else []
Example #8
def fit_binariser():
    # `labels` is assumed to be a module-level list of all label values.
    # `classes` must be passed by keyword in current scikit-learn; fit() then
    # adopts the given classes in order (y is ignored when classes is
    # supplied) with an int dtype only when every class is an int, which is
    # what the original's manual classes_ assignment replicated.
    mlb = MultiLabelBinarizer(classes=labels)
    mlb.fit([])
    return mlb
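With a hypothetical module-level labels list in scope, the returned binariser keeps the given label order rather than sorting:

labels = ['DATE', 'TIME', 'DURATION']
mlb = fit_binariser()
print(mlb.transform([['TIME'], ['DATE', 'DURATION']]))
# [[0 1 0]
#  [1 0 1]]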
Example #9
    'roerbakken/wokken', 'vooraf te maken', 'frituren', 'kerst', 'stoven',
    'midden-oosters', 'zonder vlees/vis', 'japans', 'japanse', 'budget',
    'engels', 'engelse', 'gezond', 'oud  nieuw', 'pasen', 'zonder vlees',
    'gourmet'
]

df.recipe_instruction = df.recipe_instruction.fillna('')
df_tags_matrix['text'] = (df.title + ' ' + df.description + ' ' +
                          df.recipe_instruction)

df_tags_matrix['text_without_stopwords'] = df_tags_matrix['text'].apply(
    lambda x: ' '.join(
        word for word in x.split() if word.lower() not in stop_words))

df_tags_matrix = df_tags_matrix.dropna()
mlb.classes_ = df_tags_matrix.columns[:-1]

X = df_tags_matrix['text_without_stopwords']
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
y = df_tags_matrix.drop(['text', 'text_without_stopwords'], axis=1).to_numpy()
print(X.shape, y.shape)
# shuffle and make training and test set
X_shuf, y_shuf = shuffle(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_shuf,
                                                    y_shuf,
                                                    test_size=0.3,
                                                    random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

pipeline = Pipeline([('tfidf', TfidfVectorizer(min_df=5, max_features=10000)),
                     ('clf',
                      OneVsRestClassifier(
Example #10
del hosp, diag
gc.collect()

if train_models['hglm'] or train_models['lasso'] or read_ccs:
    ccs = pd.read_csv(datadir + 'ccs.csv',
                      index_col=0,
                      header=None,
                      dtype='Int64').values
    ccs_train = ccs[idx_train]
    ccs_val = ccs[idx_val]
    ccs_test = ccs[idx_test]

    # Elements for one-hot encoding: binarize the CCS code lists and drop the
    # NaN pseudo-class that missing diagnoses introduce.
    enc1 = MultiLabelBinarizer(sparse_output=True)
    enc1.fit(ccs)
    enc1.classes_ = enc1.classes_[~np.isnan(enc1.classes_.astype(float))]
    X_diag_train = enc1.transform(ccs_train)
    X_diag_val = enc1.transform(ccs_val)
    X_diag_test = enc1.transform(ccs_test)

    # OneHotEncoder's `sparse` flag was renamed `sparse_output` in
    # scikit-learn 1.2, and sparse output is the default anyway;
    # handle_unknown='ignore' guards against cohorts absent from training.
    # Fit on the training split only, then reuse the fitted encoder so all
    # three cohort matrices share the same column space (the original called
    # fit_transform on each split, which can yield inconsistent columns).
    enc2 = OneHotEncoder(handle_unknown='ignore')
    X_cohort_train = enc2.fit_transform(
        cohort_train.astype(str).reshape(-1, 1))
    X_cohort_val = enc2.transform(cohort_val.astype(str).reshape(-1, 1))
    X_cohort_test = enc2.transform(cohort_test.astype(str).reshape(-1, 1))

del ccs

hosp_test = hosp_test.todense()
num_hosp = len(np.unique(hosp_id_test))
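The fit-on-train, transform-on-the-rest pattern used for enc2 keeps every split in the same column space; a minimal standalone sketch with made-up cohorts:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')
cohort_train = np.array(['a', 'b', 'a'])
cohort_val = np.array(['b', 'c'])
X_train = enc.fit_transform(cohort_train.reshape(-1, 1))
X_val = enc.transform(cohort_val.reshape(-1, 1))  # unseen 'c' -> all-zero row
print(X_train.shape, X_val.shape)  # (3, 2) (2, 2): identical columns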