def getEncoder(path: str) -> MultiLabelBinarizer:
    """Read the multi-label encoder used for the dataset from the given path.

    Args:
        path (str): Path to the persisted encoder classes

    Returns:
        MultiLabelBinarizer: Encoder
    """
    encoder = MultiLabelBinarizer()
    encoder.classes_ = np.load(path, allow_pickle=True)
    return encoder
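# A minimal sketch of the save step this loader assumes: persist the fitted
# classes_ with np.save so getEncoder() can restore them later. The label
# values and the 'encoder_classes.npy' file name are illustrative, not from
# the source.
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
mlb.fit([['401.9', '38.93'], ['272.4']])      # hypothetical code sets
np.save('encoder_classes.npy', mlb.classes_)  # later: getEncoder('encoder_classes.npy')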
def load_dataset(data_setting, batch_size, split):
    data = pd.read_csv(f'{GENERATED_DIR}/{split}_{data_setting}.csv',
                       dtype={'LENGTH': int})
    len_stat = data['LENGTH'].describe()
    logging.info(f'{split} set length stats:\n{len_stat}')

    if data_setting == FULL:
        # Append a dummy row containing every code so the binarizer sees the
        # full label set; the row is dropped again further below.
        code_df = pd.read_csv(f'{CODE_FREQ_PATH}', dtype={'code': str})
        all_codes = ';'.join(map(str, code_df['code'].values.tolist()))
        data = data.append(
            {
                'HADM_ID': -1,
                'TEXT': 'remove',
                'LABELS': all_codes,
                'LENGTH': 6
            },
            ignore_index=True)

    mlb = MultiLabelBinarizer()
    data['LABELS'] = data['LABELS'].apply(lambda x: str(x).split(';'))
    code_counts = list(data['LABELS'].str.len())
    avg_code_counts = sum(code_counts) / len(code_counts)
    logging.info(
        f'In {split} set, average code counts per discharge summary: {avg_code_counts}'
    )
    mlb.fit(data['LABELS'])
    temp = mlb.transform(data['LABELS'])
    if mlb.classes_[-1] == 'nan':
        mlb.classes_ = mlb.classes_[:-1]
    logging.info(f'Final number of labels/codes: {len(mlb.classes_)}')

    # One indicator column per code.
    for i, x in enumerate(mlb.classes_):
        data[x] = temp[:, i]
    data.drop(['LABELS', 'LENGTH'], axis=1, inplace=True)

    if data_setting == FULL:
        # Drop the dummy all-codes row appended above.
        data = data[:-1]

    code_list = list(mlb.classes_)
    label_freq = list(data[code_list].sum(axis=0))
    hadm_ids = data['HADM_ID'].values.tolist()
    texts = data['TEXT'].values.tolist()
    labels = data[code_list].values.tolist()
    item_count = (len(texts) // batch_size) * batch_size
    logging.info(f'{split} set true item count: {item_count}\n\n')
    return {
        'hadm_ids': hadm_ids[:item_count],
        'texts': texts[:item_count],
        'targets': labels[:item_count],
        'labels': code_list,
        'label_freq': label_freq
    }
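# Hypothetical call, assuming the generated CSVs exist under GENERATED_DIR and
# FULL is the data-setting constant used elsewhere in this module:
train_set = load_dataset(data_setting=FULL, batch_size=32, split='train')
print(len(train_set['texts']), 'summaries,', len(train_set['labels']), 'codes')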
def _set_player_ids(self, df):
    """Assign an index to each player api_id."""
    mlb_all_players = MultiLabelBinarizer()
    all_players = df[self.cols_home + self.cols_away].values
    mlb_all_players.fit(all_players)
    # Reserve -1 as the id for a missing/unknown player and keep it last.
    if -1 not in mlb_all_players.classes_:
        mlb_all_players.classes_ = np.append(mlb_all_players.classes_, -1)
    assert mlb_all_players.classes_[-1] == -1
    self.players = mlb_all_players.classes_
    self.n_players = len(self.players)
    self.api_id2idx = {api: idx for idx, api in enumerate(self.players)}
    self.idx2api_id = {idx: api for idx, api in enumerate(self.players)}
def evaluate(scores, labels, CLASSES):
    """Evaluates the predicted classes w.r.t. a gold file."""
    vocab_map = dict([(i, v) for i, v in enumerate(CLASSES)])
    mlb = MultiLabelBinarizer()
    mlb.fit([CLASSES])
    # Hack to keep the original CLASSES order instead of the sorted order.
    mlb.classes_ = np.array(CLASSES)
    gold_label = mlb.transform(labels.tolist())
    # np.matrix is deprecated; a plain 2-D array behaves the same here.
    pred_score = np.array(scores.tolist())
    pred_label = (pred_score > 0.5).astype(int)
    roc_auc = roc_auc_score(gold_label, pred_score, average="micro", multi_class="ovr")
    f1 = f1_score(gold_label, pred_label, average="micro")
    return f1, roc_auc
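# Hypothetical invocation with a tiny illustrative label set; `scores` has one
# column per class in CLASSES order and `labels` holds the gold tag lists:
import numpy as np

CLASSES = ['anger', 'joy', 'sadness']
labels = np.array([['joy'], ['anger', 'sadness']], dtype=object)
scores = np.array([[0.1, 0.9, 0.2],
                   [0.8, 0.3, 0.7]])
f1, roc_auc = evaluate(scores, labels, CLASSES)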
        self._tokenizer = None

    def create_tokenizer(self, text_list):
        tokenizer = text.Tokenizer(num_words=self._vocab_size)
        tokenizer.fit_on_texts(text_list)
        self._tokenizer = tokenizer

    def transform_text(self, text_list):
        text_matrix = self._tokenizer.texts_to_matrix(text_list)
        return text_matrix


tag_encoder = MultiLabelBinarizer()
tag_encoder.classes_ = [
    'anger', 'fear', 'happiness', 'happy', 'love', 'neutral', 'relief',
    'sadness', 'surprise', 'worry'
]


def predict_emotion(text_requests):
    classifier = CustomModelPrediction.from_path('.')
    # print("----------------------------------------------")
    # print(classifier)
    results = classifier.predict(text_requests)
    emotion = []
    for i in range(len(results)):
        for idx, val in enumerate(results[i]):
            if val > 0.1:
                emotion.append([val, tag_encoder.classes_[idx]])
    emotion.sort(reverse=True)
    # comment in flair embeddings for state-of-the-art results
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

# document_embeddings: DocumentRNNEmbeddings = DocumentMeanEmbeddings(word_embeddings)
document_embeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=32,
    reproject_words=True,
    reproject_words_dimension=word_embeddings[0].embedding_length,
)

label_encoder = MultiLabelBinarizer()
label_encoder.classes_ = np.array([
    l.decode('utf8') for l, i in sorted(
        corpus.make_label_dictionary().item2idx.items(), key=lambda x: x[1])
])


def score_fun(train_data, test_data):
    clf = TextClassifierProba(document_embeddings,
                              label_dictionary=label_dict,
                              multi_label=False)
    trainer = ModelTrainer(clf, corpus, torch.optim.RMSprop)
    base_path = 'flair_resources/text_clf/20newsgroups'
    print('start training')
    trainer.train(base_path,
                  learning_rate=0.01,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  max_epochs=20)
    y_train = label_encoder.transform(get_targets(train_data))
    y_test = label_encoder.transform(get_targets(test_data))
import pickle
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
import os

this_dir, this_filename = os.path.split(__file__)
DATA_PATH = os.path.join(this_dir, "data", "data.txt")

INPUT_ENCODER = MultiLabelBinarizer()
OUTPUT_ENCODER = MultiLabelBinarizer()
TAGGER_MODEL = pickle.load(
    open(os.path.join(this_dir, "model", "DateTimeNER.mdl"), "rb"))
DATES_MDL = pickle.load(
    open(os.path.join(this_dir, "model", "MULTI_TARGET_FOREST.mdl"), "rb"))
INPUT_ENCODER.classes_ = np.load(
    os.path.join(this_dir, "model", "INPUT_ENCODER.npy"))
OUTPUT_ENCODER.classes_ = np.load(
    os.path.join(this_dir, "model", "OUTPUT_ENCODER.npy"))


class TaggerModel(object):
    def __init__(self, features):
        self.features = features

    def predict(self):
        print(TAGGER_MODEL.predict(self.features))
        return TAGGER_MODEL.predict(self.features)[0]


class DatesSettingsModel(object):
    def __init__(self, patterns=[]):
def fit_binariser():
    # `labels` comes from the enclosing scope and fixes the class order;
    # passing it as the `classes` keyword keeps that order instead of the
    # sorted order a plain fit() would produce.
    mlb = MultiLabelBinarizer(classes=labels)
    # np.int was removed from recent NumPy releases; the builtin int works here.
    dtype = int if all(isinstance(c, int) for c in mlb.classes) else object
    # Setting classes_ manually marks the binariser as fitted without calling fit().
    mlb.classes_ = np.empty(len(mlb.classes), dtype=dtype)
    mlb.classes_[:] = mlb.classes
    return mlb
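# Hypothetical usage; this `labels` list is illustrative and stands in for the
# value the enclosing scope would normally provide to fit_binariser():
import numpy as np

labels = ['python', 'numpy', 'pandas']
mlb = fit_binariser()
print(mlb.classes_)                                  # order preserved as given
print(mlb.inverse_transform(np.array([[1, 0, 1]])))  # [('python', 'pandas')]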
    'roerbakken/wokken', 'vooraf te maken', 'frituren', 'kerst', 'stoven',
    'midden-oosters', 'zonder vlees/vis', 'japans', 'japanse', 'budget',
    'engels', 'engelse', 'gezond', 'oud nieuw', 'pasen', 'zonder vlees',
    'gourmet'
]

df.recipe_instruction = df.recipe_instruction.fillna('')
df_tags_matrix['text'] = df.title + ' ' + df.description + ' ' + df.recipe_instruction
df_tags_matrix['text_without_stopwords'] = df_tags_matrix['text'].apply(
    lambda x: ' '.join(
        [word for word in x.split() if word.lower() not in stop_words]))
df_tags_matrix = df_tags_matrix.dropna()
mlb.classes_ = df_tags_matrix.columns[:-1]

X = df_tags_matrix['text_without_stopwords']
# DataFrame.as_matrix() was removed from pandas; to_numpy() is the replacement.
y = df_tags_matrix.drop(['text', 'text_without_stopwords'], axis=1).to_numpy()
print(X.shape, y.shape)

# shuffle and make training and test set
X_shuf, y_shuf = shuffle(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_shuf,
                                                    y_shuf,
                                                    test_size=0.3,
                                                    random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

pipeline = Pipeline([('tfidf', TfidfVectorizer(min_df=5, max_features=10000)),
                     ('clf', OneVsRestClassifier(
del hosp, diag
gc.collect()

if train_models['hglm'] or train_models['lasso'] or read_ccs:
    ccs = pd.read_csv(datadir + 'ccs.csv',
                      index_col=0,
                      header=None,
                      dtype='Int64').values
    ccs_train = ccs[idx_train]
    ccs_val = ccs[idx_val]
    ccs_test = ccs[idx_test]

    # elements for one-hot encoding
    enc1 = MultiLabelBinarizer(sparse_output=True)
    enc1.fit(ccs)
    # Drop any NaN class produced by missing diagnosis codes.
    enc1.classes_ = enc1.classes_[~np.isnan(enc1.classes_.astype(float))]
    X_diag_train = enc1.transform(ccs_train)
    X_diag_val = enc1.transform(ccs_val)
    X_diag_test = enc1.transform(ccs_test)

    # Fit the cohort encoder once on the training split and reuse it, so the
    # val/test columns line up with the training encoding.
    # (`sparse=` was renamed to `sparse_output=` in newer scikit-learn.)
    enc2 = OneHotEncoder(sparse=True)
    X_cohort_train = enc2.fit_transform(cohort_train.astype(str).reshape(-1, 1))
    X_cohort_val = enc2.transform(cohort_val.astype(str).reshape(-1, 1))
    X_cohort_test = enc2.transform(cohort_test.astype(str).reshape(-1, 1))
    del ccs

hosp_test = hosp_test.todense()
num_hosp = len(np.unique(hosp_id_test))