def oecd_choose(title, text, ref):
    """Select and clean the piece of an entry indicated by ``ref``.

    Args:
        title: Title string; used when ``ref`` is ``'key'`` or ``'comb'``.
        text: Body string; used when ``ref`` is ``'value'`` or ``'comb'``.
        ref: Selection mode. ``'key'`` uses the title only, ``'value'`` uses
            the text only, and ``'comb'`` joins title and text with ``'. '``.

    Returns:
        The result of ``clean_text(..., ensure_word=True)`` applied to the
        lower-cased, stripped selection.

    Raises:
        ValueError: If ``ref`` is not ``'key'``, ``'value'`` or ``'comb'``.
    """
    # Only the argument(s) required by the chosen mode are touched, so the
    # unused one may be any object (e.g. None) without causing an error.
    if ref == 'key':
        selected = title.lower().strip()
    elif ref == 'value':
        selected = text.lower().strip()
    elif ref == 'comb':
        selected = title.lower().strip() + '. ' + text.lower().strip()
    else:
        # An unknown mode used to fall through and fail on an unbound local;
        # fail with a message that names the offending value instead.
        raise ValueError(
            f"ref must be 'key', 'value' or 'comb', got {ref!r}")
    return clean_text(selected, ensure_word=True)
def match_oecd_syn(oecd_syn_dict, oecd_clean):
    """Match training data to harmonized functional uses.

    Maps the harmonized functional uses found in a training set onto the
    canonical names in ``oecd_clean``. This is useful since different
    training sets have some variations in capitalization/cleaning.

    Each entry is resolved in this order:

    1. exact lookup of the cleaned name in ``oecd_clean``;
    2. the best fuzzy match, accepted only when it beats the runner-up by
       more than 10 points;
    3. a lookup of the raw name in ``manual_fix``.

    Entries that cannot be resolved are reported with ``print`` and dropped.

    Args:
        oecd_syn_dict: Either a dict mapping a harmonized use to a list of
            reported uses (the key itself is appended as a reported use), or
            an object with ``iterrows()`` whose rows provide
            ``harmonized_funcuse`` and ``report_funcuse`` and optionally
            ``raw_chem_name``.
        oecd_clean: Dict mapping cleaned names to canonical harmonized names.

    Returns:
        A DataFrame with columns ``report_funcuse``, ``harmonized_funcuse``
        and ``raw_chem_name``. For dict input the result is de-duplicated.
        When nothing could be matched an empty frame with those columns is
        returned.
    """
    columns = ['report_funcuse', 'harmonized_funcuse', 'raw_chem_name']
    isdict = isinstance(oecd_syn_dict, dict)
    to_iter = oecd_syn_dict.items() if isdict else oecd_syn_dict.iterrows()

    # The fuzzy-match candidates never change inside the loop; build once.
    choices = list(oecd_clean.keys())

    temp_list = []
    for key1, val1 in to_iter:
        rawchem = np.nan
        if isdict:
            key = key1
            val = val1
        else:
            key = val1['harmonized_funcuse']
            val = val1['report_funcuse']
            if 'raw_chem_name' in val1:
                rawchem = val1['raw_chem_name']

        clean_key = clean_text(key.lower().strip(), ensure_word=True)
        new_fu = None
        try:
            new_fu = oecd_clean[clean_key]
        except KeyError:
            fuzzy_match = process.extractBests(clean_key,
                                               choices,
                                               limit=2,
                                               scorer=fuzz.token_set_ratio)
            # The margin test needs two candidates. With fewer there is no
            # runner-up to compare against, so the fuzzy result is not
            # trusted and the manual table is consulted instead (this used
            # to raise IndexError).
            if (len(fuzzy_match) > 1
                    and fuzzy_match[0][1] - fuzzy_match[1][1] > 10):
                new_fu = oecd_clean[fuzzy_match[0][0]]
                print(f'Matched {key} to {new_fu}')
            else:
                try:
                    new_fu = manual_fix[key]
                except KeyError:
                    print(f'Could not match functional use: {key}')
                else:
                    print(f'Manually matched {key} to {new_fu}')

        if new_fu is None:
            continue

        df_temp = pd.DataFrame(columns=columns)
        # The list-valued first column fixes the row count; the two scalar
        # assignments below broadcast over those rows.
        df_temp['report_funcuse'] = (val + [key]) if isdict else val
        df_temp['harmonized_funcuse'] = new_fu
        df_temp['raw_chem_name'] = rawchem
        temp_list.append(df_temp.drop_duplicates())

    if not temp_list:
        # pd.concat raises ValueError on an empty list; hand back an empty
        # frame with the expected schema instead.
        return pd.DataFrame(columns=columns)

    df_comb_temp = pd.concat(temp_list).reset_index(drop=True)
    return df_comb_temp.drop_duplicates() if isdict else df_comb_temp
def get_default_set():
    """Build the default training set from the OECD names and their synonyms.

    The result stacks three sources: every key of ``oecd_def`` reported as
    itself, the matches found for ``oecd_ont``, and the matches found for
    ``maps``. Duplicate rows are removed and the index is renumbered.

    Returns:
        A DataFrame with columns ``report_funcuse``, ``harmonized_funcuse``
        and ``raw_chem_name``.
    """
    harmonized_names = oecd_def.keys()

    # Seed frame: each harmonized use is its own reported use.
    seed = pd.DataFrame(
        columns=['report_funcuse', 'harmonized_funcuse', 'raw_chem_name'])
    seed['report_funcuse'] = harmonized_names
    seed['harmonized_funcuse'] = harmonized_names

    # Lookup from cleaned name back to the canonical harmonized name.
    canonical_by_clean = {}
    for canonical in harmonized_names:
        cleaned = clean_text(canonical.lower().strip(), ensure_word=True)
        canonical_by_clean[cleaned] = canonical

    # Synonyms from the ontology first, then from the extra mappings.
    from_ontology = match_oecd_syn(oecd_ont, canonical_by_clean)
    from_maps = match_oecd_syn(maps, canonical_by_clean)

    stacked = pd.concat([seed, from_ontology, from_maps])
    deduplicated = stacked.drop_duplicates()
    return deduplicated.reset_index(drop=True)
def format_training_set(df1):
    """Split multi-valued harmonized uses and map them to canonical names.

    A row may carry several harmonized uses at once. Each row is expanded
    into one record per use returned by ``split_funcuse``, and the expanded
    table is then passed through ``match_oecd_syn``.

    Args:
        df1: DataFrame with ``report_funcuse`` and ``harmonized_funcuse``
            columns; ``raw_chem_name`` is carried along when present.

    Returns:
        The DataFrame returned by ``match_oecd_syn`` for the expanded rows.

    Raises:
        ValueError: From ``pd.concat`` when no records are produced, e.g.
            when ``df1`` has no rows.
    """
    records = []
    for _, entry in df1.iterrows():
        # Wrapped in a list because match_oecd_syn assigns it as a column.
        reported = [entry['report_funcuse']]
        for harmonized in split_funcuse(entry['harmonized_funcuse']):
            fields = {
                'report_funcuse': reported,
                'harmonized_funcuse': harmonized,
            }
            if 'raw_chem_name' in entry:
                fields['raw_chem_name'] = entry['raw_chem_name']
            records.append(pd.Series(fields))

    # Series are stacked as columns, then transposed into one row each.
    expanded = pd.concat(records, axis=1).T

    # Lookup from cleaned name back to the canonical harmonized name.
    canonical_by_clean = {}
    for canonical in oecd_def.keys():
        cleaned = clean_text(canonical.lower().strip(), ensure_word=True)
        canonical_by_clean[cleaned] = canonical

    return match_oecd_syn(expanded, canonical_by_clean)
def match_lists(rep, harm, row, chems, same=None):
    """Pair reported functional uses with harmonized functional uses.

    Works in one of two modes:

    * ``same=True``: every harmonized value may be used at most once.
      If any reported value cannot be matched confidently, the whole
      attempt is abandoned and retried with ``same=False``.
    * ``same=False``: the ``df_default`` mapping is consulted first, then a
      fuzzy match against all harmonized values. Reported values that cannot
      be matched are collected instead of aborting.

    Args:
        rep: List of reported functional-use strings.
        harm: List of harmonized functional-use strings.
        row: Source row; ``row.name`` is used in log messages and
            ``row['raw_chem_name']`` is read when ``chems`` is truthy.
        chems: Whether to copy ``raw_chem_name`` into each result.
        same: Matching mode. When ``None`` it is set to
            ``len(rep) == len(harm)``.

    Returns:
        A list whose items are ``pd.Series`` objects built here and/or the
        values returned by ``do_nothing``.
    """
    if same is None:
        if len(rep) == len(harm):
            same = True
        else:
            same = False
    rem = []  # indices into harm that have already been consumed
    new_rep = []
    new_harm = []
    bad = False
    harm_clean = [
        clean_text(i.lower().strip(), ensure_word=True) for i in harm
    ]
    # cleaned -> original; if two entries clean to the same string the later
    # one wins
    harm_d = {harm_clean[n]: i for n, i in enumerate(harm)}
    no_use_rep = []
    no_use_harm = []
    for i in rep:
        if same:
            # only harmonized values that have not been consumed yet
            match_list = [
                j for n, j in enumerate(harm_clean) if n not in rem
            ]
        else:
            match_list = harm_clean
        clean_i = clean_text(i.lower().strip(), ensure_word=True)
        if clean_i == '':
            # nothing left after cleaning; skip this reported value
            continue
        if not same:
            # first try the default mapping, on the raw or cleaned value
            map_match = df_default.loc[
                (df_default['report_funcuse'] == i) |
                (df_default['report_funcuse'] == clean_i),
                'harmonized_funcuse'].to_list()
            # the comprehension variable is local to the comprehension and
            # does not rebind the loop variable i
            new_l = [i for i in (harm + harm_clean) if i in map_match]
            if len(new_l) > 0:
                # first mapped value (in map_match order) present in harm
                new_val = [i for i in map_match if i in new_l][0]
                new_rep.append(i)
                new_harm.append(new_val)
                continue
        # NOTE(review): unreachable; the same condition already hit
        # `continue` above
        if clean_i == '':
            print(f'------- Empty string: {row.name} -------')
        fuzzy_match = process.extractBests(clean_i,
                                           match_list,
                                           limit=2,
                                           scorer=fuzz.token_set_ratio)
        # NOTE(review): if match_list is empty this presumably returns no
        # candidates and the indexing below raises IndexError; confirm
        # accept a lone candidate, or a best score more than 10 points ahead
        # of the runner-up
        qual = len(fuzzy_match) == 1 or \
            fuzzy_match[0][1] - fuzzy_match[1][1] > 10
        # in the relaxed mode also require a best score of at least 50
        if not same and fuzzy_match[0][1] < 50:
            qual = False
        if qual:
            new_rep.append(i)
            new_harm.append(harm_d[fuzzy_match[0][0]])
            # unconsumed indices whose cleaned value equals the best match
            rem_val = [
                n for n, j in enumerate(harm_clean)
                if j == fuzzy_match[0][0] and n not in rem
            ]
            if same:
                rem.append(rem_val[0])
            else:
                # in relaxed mode the value may already have been consumed
                if len(rem_val) > 0:
                    rem.append(rem_val[0])
        elif same:
            # strict mode: one uncertain match invalidates the whole pairing
            bad = True
            break
        else:
            no_use_rep.append(i)
    if len(new_harm) == 0 or len(new_rep) == 0:
        bad = True
    if bad:
        if same:
            # retry in relaxed mode
            s_list = match_lists(rep, harm, row, chems, same=False)
        else:
            # relaxed mode failed as well; defer to do_nothing
            s_list = [do_nothing(rep, harm, row, chems)]
    else:
        s_list = []
        for n, i in enumerate(new_rep):
            new_d = {
                'report_funcuse': i,
                'harmonized_funcuse': new_harm[n]
            }
            if chems:
                new_d['raw_chem_name'] = row['raw_chem_name']
            new_s = pd.Series(new_d)
            s_list.append(new_s)
        if len(new_harm) < len(harm) and len(new_rep) < len(rep):
            # leftovers on both sides: pass the unmatched remainder to
            # do_nothing as a group
            no_use_harm = [i for n, i in enumerate(harm) if n not in rem]
            s_list.append(do_nothing(no_use_rep, no_use_harm, row, chems))
            print(f'------- Row {row.name} -------\n' + 'Added: ' +
                  '|'.join(no_use_rep) + ' -> ' + '|'.join(no_use_harm))
    if not same and not bad:
        # report values that appear in neither the matched nor the leftover
        # lists
        no_harm = ', '.join(
            [i for i in harm if i not in (new_harm + no_use_harm)])
        no_rep = ', '.join(
            [i for i in rep if i not in (new_rep + no_use_rep)])
        if len(no_harm) > 0 or len(no_rep) > 0:
            print(f'------- Row {row.name} -------\n' +
                  f'Removed from report_funcuse: {no_rep}\n' +
                  f'Removed from harmonized_funcuse: {no_harm}')
    return s_list
def train():
    """Train and evaluate every model on every dataset, logging to CSV.

    For each ``(dataset, dataset_name)`` pair the texts are cleaned,
    de-duplicated, under-sampled and encoded. Each architecture in
    ``models`` is then trained on three stratified 80/20 shuffle splits.
    The checkpoint with the best validation loss is reloaded and scored at
    several decision thresholds; one row per threshold is appended to
    ``./models/<dataset_name>/log.csv``.

    Side effects:
        Creates ``./models/<dataset_name>/``, writes model checkpoints, an
        architecture diagram and ``log.csv`` there, and prints progress and
        per-architecture accuracy statistics.
    """
    for dataset, dataset_name in zip(datasets, datasets_names):
        # Rows logged so far for this dataset; the first row writes the
        # CSV header and truncates any previous log.
        j = 0
        texts, labels = dataset()
        texts, labels = clean_text(texts, labels)
        print('TAMANHO DO DATASET BRUTO:', len(labels))
        texts, labels = remove_duplicates(texts, labels)
        print('TAMANHO DO DATASET REMOVENDO REPETIÇÕES:', len(labels))
        texts, labels = under_sampling(texts, labels)
        print('TAMANHO DO DATASET FINAL:', len(texts))

        tokenizer = create_tokenizer(texts)
        length = max_length(texts)
        vocab_size = len(tokenizer.word_index) + 1
        tweets = encode_text(tokenizer, texts, length)

        # Same for every model, so convert once per dataset.
        labels = np.array([int(i) for i in labels])
        path = './models/' + dataset_name + '/'
        log_path = path + 'log.csv'

        for model_, model_name in zip(models, models_names):
            print('\nTAMANHO:', length)
            print('TAMANHO DO VOCABULARIO:', vocab_size)

            sss = StratifiedShuffleSplit(n_splits=3,
                                         random_state=42,
                                         test_size=0.2)
            checkpoint_prefix = path + dataset_name + model_name

            for k_fold, (train_index, test_index) in enumerate(
                    sss.split(tweets, labels)):
                x_train, x_test = tweets[train_index], tweets[test_index]
                y_train, y_test = labels[train_index], labels[test_index]

                # exist_ok avoids the check-then-create race.
                os.makedirs(path, exist_ok=True)

                checkpoint_file = (checkpoint_prefix + str(k_fold) +
                                   '_model.h5')
                model = model_(length, vocab_size)
                check = ModelCheckpoint(checkpoint_file,
                                        monitor='val_loss',
                                        save_best_only=True)
                stop = EarlyStopping(monitor='val_loss', patience=5)
                plot_model(model,
                           to_file=path + model_name + 'model.png',
                           show_shapes=True)
                print(model.summary())
                model.compile(loss='binary_crossentropy',
                              optimizer='adadelta',
                              metrics=['accuracy'])

                # classes/y are keyword-only in current scikit-learn;
                # keywords also work on older releases.
                classes = np.unique(y_train)
                class_weight_list = compute_class_weight('balanced',
                                                         classes=classes,
                                                         y=y_train)
                class_weight = dict(zip(classes, class_weight_list))
                print(class_weight)
                callbacks = [check, stop]

                # Try the three-input form first, then fall back to a single
                # input (presumably some architectures have three input
                # branches; confirm against the model builders).
                # `except Exception` keeps Ctrl-C and SystemExit from being
                # swallowed and silently restarting training.
                try:
                    model.fit([x_train, x_train, x_train],
                              y_train,
                              epochs=epochs,
                              batch_size=batch_size,
                              validation_data=([x_test, x_test, x_test],
                                               y_test),
                              callbacks=callbacks,
                              class_weight=class_weight,
                              verbose=1)
                except Exception:
                    model.fit(x_train,
                              y_train,
                              epochs=epochs,
                              batch_size=batch_size,
                              validation_data=(x_test, y_test),
                              callbacks=callbacks,
                              class_weight=class_weight,
                              verbose=1)

                # Score the best checkpoint, not the last-epoch weights.
                del model
                model = load_model(checkpoint_file)
                try:
                    y_pred = model.predict(x_test)
                except Exception:
                    y_pred = model.predict([x_test, x_test, x_test])

                for threshold in [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7]:
                    y_pred_ = [1 if y > threshold else 0 for y in y_pred]
                    accuracy = accuracy_score(y_test, y_pred_)
                    print('thresh', threshold)
                    print(accuracy)
                    to_save = {
                        'arc': model_name,
                        'fold': k_fold,
                        'acc': accuracy,
                        'prec': precision_score(y_test, y_pred_),
                        'rec': recall_score(y_test, y_pred_),
                        'f1': f1_score(y_test, y_pred_),
                        'dataset': dataset_name,
                        'thresh': threshold
                    }
                    df = pd.DataFrame([to_save])
                    if k_fold == 0 and j == 0:
                        with open(log_path, 'w') as f:
                            df.to_csv(f, header=True)
                    else:
                        with open(log_path, 'a') as f:
                            df.to_csv(f, header=False)
                    j = j + 1

                # Running accuracy statistics for this architecture.
                df_ = pd.read_csv(log_path, index_col=[0])
                print(model_name)
                print(df_[df_['arc'] == model_name].acc.mean())
                print(df_[df_['arc'] == model_name].acc.std())
def transform_text(texts, length=26):
    """Clean and encode texts using a loaded tokenizer.

    Args:
        texts: Raw texts, in whatever form ``clean_text`` accepts.
        length: Sequence length passed to ``encode_text``. Defaults to 26,
            the value that used to be hard-coded here.
            NOTE(review): presumably this must equal the length the model
            was trained with; confirm against the training run.

    Returns:
        The encoded texts as returned by ``encode_text``.
    """
    cleaned, _ = clean_text(texts, labels=None)
    # load=True asks create_tokenizer for an existing tokenizer
    # (presumably the one saved at training time; verify in that helper).
    tokenizer = create_tokenizer(cleaned, load=True)
    return encode_text(tokenizer, cleaned, length)