Example No. 1
def oecd_choose(title, text, ref):
    """Choose which cleaned text to use: the title, the body text, or both."""
    if ref == 'key':
        value = clean_text(title.lower().strip(), ensure_word=True)
    elif ref == 'value':
        value = clean_text(text.lower().strip(), ensure_word=True)
    elif ref == 'comb':
        value = clean_text(title.lower().strip() + '. ' +
                           text.lower().strip(),
                           ensure_word=True)
    else:
        raise ValueError(f"unexpected ref value: {ref!r}")
    return value
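A minimal usage sketch of the three `ref` modes; the title/text strings are invented, and `clean_text` is assumed to be the normalisation helper defined elsewhere in the module:

# Hypothetical inputs for illustration only.
title = 'Adhesion promoter'
text = 'Improves bonding between coating layers.'
oecd_choose(title, text, ref='key')    # cleaned title only
oecd_choose(title, text, ref='value')  # cleaned description only
oecd_choose(title, text, ref='comb')   # cleaned title and description joined by '. '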
Example No. 2
def match_oecd_syn(oecd_syn_dict, oecd_clean):
    """Match training data to harmonized functional use.

    Matches harmonized functional uses in a training set to the ones used for
    training. This is useful since different training sets have some
    variations in capitalization/cleaning.
    """
    temp_list = []
    if isinstance(oecd_syn_dict, dict):
        to_iter = oecd_syn_dict.items()
        isdict = True
    else:
        to_iter = oecd_syn_dict.iterrows()
        isdict = False
    for key1, val1 in to_iter:
        rawchem = np.nan
        if isdict:
            key = key1
            val = val1
        else:
            key = val1['harmonized_funcuse']
            val = val1['report_funcuse']
            if 'raw_chem_name' in val1:
                rawchem = val1['raw_chem_name']
        clean_key = clean_text(key.lower().strip(), ensure_word=True)
        df_temp = pd.DataFrame(
            columns=['report_funcuse', 'harmonized_funcuse', 'raw_chem_name'])
        new_fu = None
        try:
            new_fu = oecd_clean[clean_key]
        except KeyError:
            fuzzy_match = process.extractBests(clean_key,
                                               list(oecd_clean.keys()),
                                               limit=2,
                                               scorer=fuzz.token_set_ratio)
            if fuzzy_match[0][1] - fuzzy_match[1][1] > 10:
                new_fu = oecd_clean[fuzzy_match[0][0]]
                print(f'Matched {key} to {new_fu}')
            else:
                try:
                    new_fu = manual_fix[key]
                except KeyError:
                    print(f'Could not match functional use: {key}')
                else:
                    print(f'Manually matched {key} to {new_fu}')
        if new_fu is not None:
            df_temp['report_funcuse'] = (val + [key]) if isdict else val
            df_temp['harmonized_funcuse'] = new_fu
            df_temp['raw_chem_name'] = rawchem
        df_temp = df_temp.drop_duplicates()
        temp_list.append(df_temp)
    df_comb_temp = pd.concat(temp_list).reset_index(drop=True)
    return df_comb_temp.drop_duplicates() if isdict else df_comb_temp
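A sketch of the two input shapes this function accepts, assuming `oecd_def` and `clean_text` come from the surrounding module; the synonym and chemical strings are invented for illustration:

# Lookup from cleaned harmonized name back to the canonical OECD name.
oecd_clean = {clean_text(k.lower().strip(), ensure_word=True): k for k in oecd_def.keys()}

# 1) Dictionary input: harmonized use -> list of reported synonyms.
syn_dict = {'Adhesion promoter': ['bonding agent', 'adhesive']}
df_from_dict = match_oecd_syn(syn_dict, oecd_clean)

# 2) DataFrame input with report_funcuse / harmonized_funcuse (and optional raw_chem_name).
df_in = pd.DataFrame({'report_funcuse': [['bonding agent']],
                      'harmonized_funcuse': ['Adhesion promoter'],
                      'raw_chem_name': ['styrene butadiene copolymer']})
df_from_frame = match_oecd_syn(df_in, oecd_clean)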
Example No. 3
def get_default_set():
    """Make a training set with the default OECD names and synonyms."""
    # make df of harmonized uses for training
    df = pd.DataFrame(
        columns=['report_funcuse', 'harmonized_funcuse', 'raw_chem_name'])
    df['report_funcuse'] = oecd_def.keys()
    df['harmonized_funcuse'] = oecd_def.keys()
    oecd_clean = {
        clean_text(key.lower().strip(), ensure_word=True): key
        for key in oecd_def.keys()
    }

    # add functional uses from oecd_ont
    ont_match = match_oecd_syn(oecd_ont, oecd_clean)
    maps_match = match_oecd_syn(maps, oecd_clean)
    df_comb = pd.concat([df, ont_match, maps_match]) \
        .drop_duplicates().reset_index(drop=True)
    return df_comb
Example No. 4
def format_training_set(df1):
    """Format training set."""
    # in this dataset, there are sometimes multiple assigned harmonized uses
    # this splits them up when they couldn't be matched by the other function
    temp = []
    for name, row in df1.iterrows():
        n1 = [row['report_funcuse']]
        split_harm = split_funcuse(row['harmonized_funcuse'])
        for n2 in split_harm:
            new_series = {'report_funcuse': n1, 'harmonized_funcuse': n2}
            if 'raw_chem_name' in row:
                new_series['raw_chem_name'] = row['raw_chem_name']
            s = pd.Series(new_series)
            temp.append(s)
    df1_split = pd.concat(temp, axis=1).T

    # send to cleaning function
    oecd_clean = {
        clean_text(key.lower().strip(), ensure_word=True): key
        for key in oecd_def.keys()
    }
    df1_fixed = match_oecd_syn(df1_split, oecd_clean)
    return df1_fixed
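A hedged sketch of the expected input: rows whose harmonized_funcuse cell may carry more than one use. `split_funcuse` is the module's own splitter, and the delimiter shown below is only illustrative:

# Invented rows: one reported use mapped to two harmonized uses in a single cell.
df_raw = pd.DataFrame({
    'report_funcuse': ['bonding agent'],
    'harmonized_funcuse': ['adhesion promoter; adhesive'],
    'raw_chem_name': ['styrene butadiene copolymer'],
})
df_fixed = format_training_set(df_raw)  # one row per split harmonized use, matched to OECD names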
Example No. 5
    def match_lists(rep, harm, row, chems, same=None):
        """Match values on lists."""
        if same is None:
            same = len(rep) == len(harm)
        rem = []
        new_rep = []
        new_harm = []
        bad = False
        harm_clean = [
            clean_text(i.lower().strip(), ensure_word=True) for i in harm
        ]
        harm_d = {harm_clean[n]: i for n, i in enumerate(harm)}
        no_use_rep = []
        no_use_harm = []
        for i in rep:
            if same:
                match_list = [
                    j for n, j in enumerate(harm_clean) if n not in rem
                ]
            else:
                match_list = harm_clean
            clean_i = clean_text(i.lower().strip(), ensure_word=True)

            if clean_i == '':
                continue

            if not same:
                map_match = df_default.loc[
                    (df_default['report_funcuse'] == i) |
                    (df_default['report_funcuse'] == clean_i),
                    'harmonized_funcuse'].to_list()
                new_l = [i for i in (harm + harm_clean) if i in map_match]
                if len(new_l) > 0:
                    new_val = [i for i in map_match if i in new_l][0]
                    new_rep.append(i)
                    new_harm.append(new_val)
                    continue
            fuzzy_match = process.extractBests(clean_i,
                                               match_list,
                                               limit=2,
                                               scorer=fuzz.token_set_ratio)
            qual = len(fuzzy_match) == 1 or \
                fuzzy_match[0][1] - fuzzy_match[1][1] > 10
            if not same and fuzzy_match[0][1] < 50:
                qual = False
            if qual:
                new_rep.append(i)
                new_harm.append(harm_d[fuzzy_match[0][0]])
                rem_val = [
                    n for n, j in enumerate(harm_clean)
                    if j == fuzzy_match[0][0] and n not in rem
                ]
                if same:
                    rem.append(rem_val[0])
                else:
                    if len(rem_val) > 0:
                        rem.append(rem_val[0])
            elif same:
                bad = True
                break
            else:
                no_use_rep.append(i)

        if len(new_harm) == 0 or len(new_rep) == 0:
            bad = True

        if bad:
            if same:
                s_list = match_lists(rep, harm, row, chems, same=False)
            else:
                s_list = [do_nothing(rep, harm, row, chems)]
        else:
            s_list = []
            for n, i in enumerate(new_rep):
                new_d = {
                    'report_funcuse': i,
                    'harmonized_funcuse': new_harm[n]
                }
                if chems:
                    new_d['raw_chem_name'] = row['raw_chem_name']
                new_s = pd.Series(new_d)
                s_list.append(new_s)
            if len(new_harm) < len(harm) and len(new_rep) < len(rep):
                no_use_harm = [i for n, i in enumerate(harm) if n not in rem]
                s_list.append(do_nothing(no_use_rep, no_use_harm, row, chems))
                print(f'------- Row {row.name} -------\n' + 'Added: ' +
                      '|'.join(no_use_rep) + ' -> ' + '|'.join(no_use_harm))

        if not same and not bad:
            no_harm = ', '.join(
                [i for i in harm if i not in (new_harm + no_use_harm)])
            no_rep = ', '.join(
                [i for i in rep if i not in (new_rep + no_use_rep)])
            if len(no_harm) > 0 or len(no_rep) > 0:
                print(f'------- Row {row.name} -------\n' +
                      f'Removed from report_funcuse: {no_rep}\n' +
                      f'Removed from harmonized_funcuse: {no_harm}')
        return s_list
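The matching above leans on fuzzywuzzy's `process.extractBests` with the `token_set_ratio` scorer; a standalone sketch of that call with invented strings:

from fuzzywuzzy import fuzz, process

choices = ['adhesion promoter', 'adhesive', 'antioxidant']
best = process.extractBests('bonding adhesive', choices, limit=2,
                            scorer=fuzz.token_set_ratio)
# 'best' is a list of (choice, score) tuples sorted by score; the code above only
# trusts the top hit when it beats the runner-up by more than 10 points.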
def train():
    """Train each model on each dataset and log per-threshold metrics to log.csv."""
    for dataset, dataset_name in zip(datasets, datasets_names):
        j = 0
        texts, labels = dataset()
        texts, labels = clean_text(texts, labels)
        print('RAW DATASET SIZE:', len(labels))
        texts, labels = remove_duplicates(texts, labels)
        print('DATASET SIZE AFTER REMOVING DUPLICATES:', len(labels))
        texts, labels = under_sampling(texts, labels)
        print('FINAL DATASET SIZE:', len(texts))
        tokenizer = create_tokenizer(texts)
        length = max_length(texts)
        vocab_size = len(tokenizer.word_index) + 1
        tweets = encode_text(tokenizer, texts, length)

        for model_, model_name in zip(models, models_names):
            print('\nSEQUENCE LENGTH:', length)
            print('VOCABULARY SIZE:', vocab_size)

            k_fold = 0
            sss = StratifiedShuffleSplit(n_splits=3,
                                         random_state=42,
                                         test_size=0.2)
            labels = np.array([int(i) for i in labels])

            for train_index, test_index in sss.split(tweets, labels):
                x_train, x_test = tweets[train_index], tweets[test_index]
                y_train, y_test = labels[train_index], labels[test_index]

                path = './models/' + dataset_name + '/'
                os.makedirs(path, exist_ok=True)

                model = model_(length, vocab_size)
                check = ModelCheckpoint(path + dataset_name + model_name +
                                        str(k_fold) + '_model.h5',
                                        monitor='val_loss',
                                        save_best_only=True)
                stop = EarlyStopping(monitor='val_loss', patience=5)
                plot_model(model,
                           to_file=path + model_name + 'model.png',
                           show_shapes=True)
                print(model.summary())
                model.compile(loss='binary_crossentropy',
                              optimizer='adadelta',
                              metrics=['accuracy'])

                class_weight_list = compute_class_weight(
                    class_weight='balanced', classes=np.unique(y_train),
                    y=y_train)
                class_weight = dict(zip(np.unique(y_train), class_weight_list))
                print(class_weight)
                callbacks = [check, stop]

                try:
                    h = model.fit([x_train, x_train, x_train],
                                  y_train,
                                  epochs=epochs,
                                  batch_size=batch_size,
                                  validation_data=([x_test, x_test,
                                                    x_test], y_test),
                                  callbacks=callbacks,
                                  class_weight=class_weight,
                                  verbose=1)
                except Exception:  # fall back for single-input architectures
                    h = model.fit(x_train,
                                  y_train,
                                  epochs=epochs,
                                  batch_size=batch_size,
                                  validation_data=(x_test, y_test),
                                  callbacks=callbacks,
                                  class_weight=class_weight,
                                  verbose=1)

                del model
                model = load_model(path + dataset_name + model_name +
                                   str(k_fold) + '_model.h5')

                try:
                    y_pred = model.predict(x_test)
                except Exception:  # multi-input architectures need the input repeated
                    y_pred = model.predict([x_test, x_test, x_test])

                for threshold in [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7]:
                    y_pred_ = [1 if y > threshold else 0 for y in y_pred]
                    print('thresh', threshold)
                    print(accuracy_score(y_test, y_pred_))

                    log_path = path + 'log.csv'
                    to_save = {
                        'arc': model_name,
                        'fold': k_fold,
                        'acc': accuracy_score(y_test, y_pred_),
                        'prec': precision_score(y_test, y_pred_),
                        'rec': recall_score(y_test, y_pred_),
                        'f1': f1_score(y_test, y_pred_),
                        'dataset': dataset_name,
                        'thresh': threshold
                    }

                    df = pd.DataFrame([to_save])

                    if k_fold == 0 and j == 0:
                        with open(log_path, 'w') as f:
                            df.to_csv(f, header=True)
                    else:
                        with open(log_path, 'a') as f:
                            df.to_csv(f, header=False)
                    j = j + 1
                    df_ = pd.read_csv(log_path, index_col=[0])
                    print(model_name)
                    print(df_[df_['arc'] == model_name].acc.mean())
                    print(df_[df_['arc'] == model_name].acc.std())

                k_fold = k_fold + 1
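The class weighting inside train comes from scikit-learn's compute_class_weight; a standalone sketch with made-up labels (recent scikit-learn versions require the keyword-argument form used here):

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y_train = np.array([0, 0, 0, 1])
weights = compute_class_weight(class_weight='balanced',
                               classes=np.unique(y_train),
                               y=y_train)
class_weight = dict(zip(np.unique(y_train), weights))
# {0: 0.666..., 1: 2.0} -- the minority class gets the larger weight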
def transform_text(texts):
    texts, _ = clean_text(texts, labels=None)
    tokenizer = create_tokenizer(texts, load=True)
    # 26 is assumed to match the sequence length used when the models were trained
    tweets = encode_text(tokenizer, texts, 26)
    return tweets
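A possible inference path built on transform_text; the model path is purely illustrative, and load_model is the Keras loader already used in train:

sample = ['an example tweet to classify']
encoded = transform_text(sample)
model = load_model('./models/<dataset>/<dataset><model>0_model.h5')  # illustrative path
probs = model.predict(encoded)           # one probability per input text
labels = [1 if p > 0.5 else 0 for p in probs]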