Example #1
def run_tune_umap(PROJECT_NAME, N_NEIGHBORS=None, MIN_DIST=None, **kwargs):
    """
    """
    dirs_creation([f'results\\umap_tune\\{PROJECT_NAME}'], wipe_dir=True)
    TARGET_DECODER = pd.read_pickle(
        f'results\\objects\\{PROJECT_NAME}\\target_decoder.pkl')
    if N_NEIGHBORS is None:
        N_NEIGHBORS = [5, 15, 45, 135, 400]
    if MIN_DIST is None:
        MIN_DIST = [0.01, 0.1, 0.3, 0.5, 0.7, 0.9]

    targets_themes = pd.read_pickle(
        f'results\\objects\\{PROJECT_NAME}\\themes.pkl')
    parameters_combination = list(product(N_NEIGHBORS, MIN_DIST))

    model = load_model(f'results\\models\\{PROJECT_NAME}')

    encodes, targets, colors = embedding_extraction(
        model=model,
        project_name=PROJECT_NAME,
        target_decoder=TARGET_DECODER,
        colors_bins=20,
        extraction_point='features_extractor')

    UMAP_tuning(array=encodes,
                targets=targets,
                colors=colors,
                parameters_combination=parameters_combination,
                targets_themes=targets_themes,
                figsize=(10, 10),
                n_components=2,
                project_name=PROJECT_NAME,
                embed_targets=True,
                **kwargs)
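A minimal usage sketch for run_tune_umap, assuming the function above is importable and that a model has already been trained and saved for the project (it is loaded via load_model); the project name and the reduced grids are illustrative:

# Hypothetical call: sweep a smaller grid for a project named 'books'.
# Leaving N_NEIGHBORS / MIN_DIST as None falls back to the full default grids.
run_tune_umap('books',
              N_NEIGHBORS=[15, 45],
              MIN_DIST=[0.1, 0.5])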
Example #2
def run_dimensionality_reduction(PROJECT_NAME,
                                 LOCAL_N_NEIGH=5,
                                 LOCAL_MIN_DIST=0.2,
                                 GLOBAL_N_NEIGH=5,
                                 GLOBAL_MIN_DIST=0.2,
                                 **kwargs):
    """
    """
    TARGET_DECODER = pd.read_pickle(
        f'results\\objects\\{PROJECT_NAME}\\target_decoder.pkl')

    model = load_model(f'results\\models\\{PROJECT_NAME}')

    encodes, targets_array, colors = embedding_extraction(
        model=model,
        project_name=PROJECT_NAME,
        target_decoder=TARGET_DECODER,
        colors_bins=20,
        extraction_point='features_extractor')

    for dims in [2, 3]:

        root = f'results\\objects\\{PROJECT_NAME}\\{dims}D'
        dirs_creation([root], wipe_dir=True)

        print(f'Reduction for {dims}D galaxy.')
        reduction = UMAP(n_components=dims,
                         n_neighbors=GLOBAL_N_NEIGH,
                         min_dist=GLOBAL_MIN_DIST,
                         **kwargs).fit_transform(encodes)

        np.save(f'results\\objects\\{PROJECT_NAME}\\{dims}D\\galaxy',
                reduction)

        for unique_target in np.unique(targets_array):

            index = np.argwhere(targets_array == unique_target).flatten()

            print(f'Reduction for {dims}D {unique_target}.')
            reduction = UMAP(n_components=dims,
                             n_neighbors=LOCAL_N_NEIGH,
                             min_dist=LOCAL_MIN_DIST,
                             verbose=True,
                             n_epochs=1000).fit_transform(encodes[index])

            np.save(
                f'results\\objects\\{PROJECT_NAME}\\{dims}D\\{unique_target}',
                reduction)
Example #3
def run_data_preprocessing(PROJECT_NAME, PROJECT_THEMES):
    """
    """
    dirs_creation([
        f'data\\preprocessed\\{PROJECT_NAME}',
        f'results\\objects\\{PROJECT_NAME}'
    ],
                  wipe_dir=True)

    df = pdf_plumbering('data\\raw', PROJECT_NAME)

    sentences = list(df['sentences'].values)
    targets = df['document'].values

    dirs_creation([
        f'data\\preprocessed\\{PROJECT_NAME}\\inputs',
        f'data\\preprocessed\\{PROJECT_NAME}\\targets'
    ],
                  wipe_dir=True)

    sentence_encoder, sentence_decoder, target_encoder, \
        target_decoder = preprocessing(
            list_sentences=sentences,
            targets=targets,
            project_id=PROJECT_NAME,
            max_len=1000,
            max_batch=64
         )

    dump_pickle(objs=[
        sentence_encoder, sentence_decoder, target_encoder, target_decoder,
        PROJECT_THEMES
    ],
                paths=[f'results\\objects\\{PROJECT_NAME}'] * 5,
                filenames=[
                    'sentence_encoder', 'sentence_decoder', 'target_encoder',
                    'target_decoder', 'themes'
                ])
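For orientation, a hedged sketch of how the entry points shown so far might be chained for one project; the project name, colormap themes and parameter values are illustrative, and a model is assumed to have been trained and saved between the preprocessing and UMAP steps, since both run_tune_umap and run_dimensionality_reduction call load_model:

# Illustrative pipeline order; names and values are placeholders.
PROJECT = 'books'
THEMES = {'moby_dick': 'viridis', 'dracula': 'magma'}  # target -> matplotlib cmap

run_data_preprocessing(PROJECT, THEMES)   # pickle encoders/decoders and themes
# ... train and save the model under results\models\{PROJECT} ...
run_tune_umap(PROJECT)                    # grid-search n_neighbors / min_dist
run_dimensionality_reduction(PROJECT,
                             GLOBAL_N_NEIGH=45,
                             GLOBAL_MIN_DIST=0.1,
                             LOCAL_N_NEIGH=15,
                             LOCAL_MIN_DIST=0.1)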
Example #4
def UMAP_tuning(array,
                targets,
                colors,
                parameters_combination,
                targets_themes,
                figsize,
                project_name,
                n_components=2,
                embed_targets=False,
                **kwargs):
    """
    """
    root = f'results\\umap_tune\\{project_name}'
    dirs_creation(
        [f'{root}\\{para[0]}_{para[1]}' for para in parameters_combination],
        wipe_dir=True)
    plt.style.use('dark_background')
    for parameters in tqdm(parameters_combination):

        reduction = UMAP_fitting(array=array,
                                 n_components=n_components,
                                 n_neighbors=parameters[0],
                                 min_dist=parameters[1],
                                 **kwargs)

        if embed_targets:
            for target, theme in targets_themes.items():

                index = np.argwhere(targets == target).flatten()
                target_reduction = UMAP_fitting(array=array[index],
                                                n_components=n_components,
                                                n_neighbors=parameters[0],
                                                min_dist=parameters[1],
                                                **kwargs)
                fig_target, ax_target = plt.subplots(figsize=(10, 10))
                ax_target.scatter(target_reduction[:, 0],
                                  target_reduction[:, 1],
                                  s=0.25,
                                  c=colors[index],
                                  cmap=theme,
                                  edgecolor='',
                                  marker='o')
                ax_target.axis('off')

                ax_target.text(0.5,
                               1,
                               target.upper(),
                               horizontalalignment='center',
                               verticalalignment='center',
                               transform=ax_target.transAxes,
                               fontname='Microsoft Yi Baiti',
                               size=20,
                               weight='bold')

                fig_target.savefig(
                    f'{root}\\{parameters[0]}_{parameters[1]}\\{target}_e.png',
                    dpi=400)

        fig_main, ax_main = plt.subplots(figsize=figsize)
        for target, theme in targets_themes.items():

            index = np.argwhere(targets == target).flatten()
            fig_sub, ax_sub = plt.subplots(figsize=(10, 10))
            ax_sub.scatter(reduction[:, 0][index],
                           reduction[:, 1][index],
                           s=0.25,
                           c=colors[index],
                           cmap=theme,
                           edgecolor='',
                           marker='o')
            ax_sub.axis('off')

            ax_sub.text(0.5,
                        1,
                        target.upper(),
                        horizontalalignment='center',
                        verticalalignment='center',
                        transform=ax_sub.transAxes,
                        fontname='Microsoft Yi Baiti',
                        size=20,
                        weight='bold')
            fig_sub.savefig(
                f'{root}\\{parameters[0]}_{parameters[1]}\\{target}.png',
                dpi=400)

            ax_main.scatter(reduction[:, 0][index],
                            reduction[:, 1][index],
                            s=0.25,
                            c=colors[index],
                            cmap=theme,
                            edgecolor='',
                            marker='o')

            ax_main.axis('off')

        ax_main.text(0.5,
                     1,
                     'BOOKS GALAXY',
                     horizontalalignment='center',
                     verticalalignment='center',
                     transform=ax_main.transAxes,
                     fontname='Microsoft Yi Baiti',
                     size=20,
                     weight='bold')
        fig_main.savefig(
            f'{root}\\{parameters[0]}_{parameters[1]}\\galaxy.png', dpi=400)
        plt.close('all')
Example #5
import pandas as pd

from modules.utils.data_utils import preprocessing
from modules.utils.general_utils import dirs_creation, dump_pickle

DIRS = ['data\\inputs', 'data\\targets', 'results\\objects']
FRAC = 1.0

dirs_creation(
    dirs=DIRS,
    wipe_dir=True,
)

df = pd.read_csv('data\\csv\\cleaned\\airline_twitter.csv')
df = df.sample(frac=FRAC).reset_index(drop=True)
list_sentences = list(df['tweet'].values)
sentiments = df['sentiment'].map({
    'neutral': 0,
    'positive': 1,
    'negative': 2
}).values

encoder, decoder = preprocessing(list_sentences, sentiments, max_len=1000)

dump_pickle(objs=[encoder, decoder],
            paths=['results\\objects'] * 2,
            filenames=['encoder', 'decoder'])
Example #6
import os

import numpy as np
import pandas as pd
# Assumed import: `es` is used below with EarlyStopping's signature
# (min_delta, patience, monitor, restore_best_weights).
from tensorflow.keras.callbacks import EarlyStopping as es

from modules.utils.general_utils import dirs_creation

###############################################################

DECODER = pd.read_pickle('results\\objects\\decoder.pkl')

# Shuffled batch indices, split 80/20 into training and test batches.
BTCH = [i for i in range(len(os.listdir('data\\inputs')))]
BTCH = np.random.choice(BTCH, len(BTCH), replace=False)

TR_BTCH = BTCH[: int(len(BTCH) * 0.8)]
TS_BTCH = BTCH[int(len(BTCH) * 0.8):]

DIRS = ['results\\models']

dirs_creation(
    dirs=DIRS,
    wipe_dir=False,
)

###############################################################

stopper = es(
    min_delta=0.0001,
    patience=5,
    monitor='val_loss',
    restore_best_weights=True
)
hp_dict = {
    'embedding_units': 250,
    'lstm_units': 100,
    'dense_units': 100,
    'dropout': 0.2