Code example #1
import os
import pickle
from collections import Counter

from utils import extract_data_hdf5
import data_helper  # assumed: project-local helper module providing BaseConfiguration, preproc_data, etc.

# keras
from keras.layers import Dense, Dropout, Bidirectional
from keras.layers import GRU, Layer, Input
from keras.layers.embeddings import Embedding
from keras.utils import np_utils
import keras
from keras.models import Model

# sklearn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, f1_score

# parameters for initial configuration
config = data_helper.BaseConfiguration()
config.MAX_SENTS = 10

"""data section"""
# check whether the pickled dataset already exists
dataset_path = '../preprocessed_data/dataset.pkl'
use_exist = False
if os.path.isfile(dataset_path):
    data_whole = pickle.load(open(dataset_path, 'rb'))
    use_exist = True

    dataset = data_whole['dataset_raw']
    contexts = data_whole['context_raw']
    labels = data_whole['labels']
    codes = data_whole['codes']
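
The snippet stops at the cache-hit branch. A minimal sketch of the complementary path, assuming build_dataset from code example #2 below is importable from the same project:

# Hypothetical fallback (not in the original): rebuild the cache when it is
# absent, then load it exactly as the cache-hit branch above does.
if not os.path.isfile(dataset_path):
    build_dataset(output=dataset_path)  # see code example #2

data_whole = pickle.load(open(dataset_path, 'rb'))
dataset = data_whole['dataset_raw']
contexts = data_whole['context_raw']
labels = data_whole['labels']
codes = data_whole['codes']
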
Code example #2
import sys
import pickle

import numpy as np
import keras
from keras.utils import np_utils

from utils import extract_data_hdf5
import data_helper  # assumed: project-local helper module providing BaseConfiguration, preproc_data, etc.


def build_dataset(output='./preprocessed_data/dataset.pkl'):
    # parameters for initial configuration
    config = data_helper.BaseConfiguration()
    config.MAX_SENTS = 10

    # load data
    data_filters = {'SPEAKER': set(['P'])}
    data_filters['CODE'] = set([
        '0+2', '0+3', '0+4', '0-2', '0-3', '0-4', 'C+2', 'C+3', 'C+4', 'C-4',
        'FN', 'Fn', 'O+1', 'O+2', 'O+3', 'O+4', 'O+5', 'O-1', 'O-2', 'O-3',
        'O-4', 'R+1', 'R+2', 'R+3', 'R+4', 'R-1', 'R-2', 'R-3', 'R-4', 'RA+2',
        'RA+3', 'RA-2', 'RA-3', 'RD+3', 'RD+4', 'RD-3', 'RN+2', 'RN+3', 'Ra+2',
        'Ra+3', 'Rd+3', 'Rn+3', 'TS+2', 'TS+3', 'TS+4', 'TS-2', 'Ts+2', 'c+1',
        'c+2', 'c+3', 'c+4', 'c+5', 'c-1', 'c-2', 'c-3', 'c-4', 'f n', 'fn',
        'o+1', 'o+2', 'o+3', 'o+4', 'o+5', 'o-1', 'o-2', 'o-3', 'o-4', 'o-5',
        'r+1', 'r+2', 'r+3', 'r+4', 'r+5', 'r-1', 'r-2', 'r-3', 'r-4', 'r-5',
        'ra+1', 'ra+2', 'ra+3', 'ra+4', 'ra+5', 'ra-1', 'ra-2', 'ra-3', 'ra-4',
        'ra-5', 'rd+1', 'rd+2', 'rd+3', 'rd+4', 'rd+5', 'rd-2', 'rd-3', 'rd-4',
        'rn+1', 'rn+2', 'rn+3', 'rn+4', 'rn-3', 'rn-4', 'rs+2', 'ts+1', 'st',
        'ts+2', 'ts+3', 'ts+4', 'ts-1', 'ts-2', 'ts-3'
    ])

    data_tuples = extract_data_hdf5.extract_hdf5("./MI_hdf5/*.hdf5",
                                                 filters=data_filters,
                                                 min_seqs=config.seq_min_len,
                                                 include_pre_contxt=True)
    dataset = []
    contexts = []
    labels = []
    codes = []
    domain_labels = []

    for data, context, label, code, da_label in data_tuples:
        dataset.append(data)
        contexts.append(context)
        labels.append(label)
        codes.append(code)
        domain_labels.append(da_label)
    print('Whole dataset has ' + str(len(dataset)) + ' samples...............')

    # preprocess the data
    dataset = data_helper.preproc_data(dataset)
    contexts = data_helper.preproc_data(contexts)
    labels = np.asarray(labels)
    codes = np.asarray(
        data_helper.encode_codes(codes))  # encode the codes to -1, 0, 1
    domain_labels = np.asarray(domain_labels)

    all_dataset = dataset + contexts

    # Python 2 needs the text transliterated to ASCII before tokenizing
    if sys.version_info[0] == 2:
        from unidecode import unidecode
        processed_dataset, keras_tokenizer = data_helper.padding2sequences(
            [unidecode(item) for item in all_dataset],
            MAX_NB_WORDS=config.vocabulary_size,
            MAX_SEQUENCE_LENGTH=config.seq_max_len)
    else:
        processed_dataset, keras_tokenizer = data_helper.padding2sequences(
            all_dataset,
            MAX_NB_WORDS=config.vocabulary_size,
            MAX_SEQUENCE_LENGTH=config.seq_max_len)

    processed_labels, labels_tokenizer = data_helper.padding2sequences(
        [' '.join(items) for items in labels],
        MAX_NB_WORDS=config.vocabulary_size,
        MAX_SEQUENCE_LENGTH=10)
    processed_contexts = processed_dataset[len(dataset):]
    processed_dataset = processed_dataset[:len(dataset)]

    if keras.__version__.startswith('1'):
        code_targets = np_utils.to_categorical(codes, nb_classes=3)
    else:
        code_targets = np_utils.to_categorical(codes, num_classes=3)

    processed_dataset = np.asarray(processed_dataset)
    processed_contexts = np.asarray(processed_contexts)
    processed_labels = np.asarray(processed_labels)

    train_indices, valid_indices, test_indices = data_helper.split_tvt_idx(
        processed_dataset)

    # cache everything to a pickle file for later reuse
    data_whole = dict()
    data_whole['dataset_raw'] = dataset
    data_whole['context_raw'] = contexts
    data_whole['labels'] = labels
    data_whole['codes'] = codes
    data_whole['domain_labels'] = domain_labels

    data_whole['dataset_processed'] = processed_dataset
    data_whole['context_processed'] = processed_contexts
    data_whole['codes_preprocessed'] = code_targets
    data_whole['labels_preprocessed'] = processed_labels
    data_whole['train_indices'] = train_indices
    data_whole['valid_indices'] = valid_indices
    data_whole['test_indices'] = test_indices
    data_whole['keras_tokenizer'] = keras_tokenizer
    data_whole['labels_tokenizer'] = labels_tokenizer

    if sys.version_info[0] == 2:
        import codecs
        data_file = codecs.open(output, 'wb')
    else:
        data_file = open(output, 'wb')

    pickle.dump(data_whole, data_file, protocol=2)
    pickle.dump(keras_tokenizer,
                open("./preprocessed_data/keras_tokenizer.pkl", "wb"))
    pickle.dump(labels_tokenizer,
                open("./preprocessed_data/labels_tokenizer.pkl", "wb"))
    data_file.close()
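
A minimal usage sketch (the function name, cache path, and dictionary keys come from the code above; everything else is assumed, and actually running it requires the MI_hdf5 corpus and the project-local helpers to be present):

import pickle

# Hypothetical driver: build the cache once, then read back the arrays a
# training script would need (keys as written by build_dataset above).
build_dataset(output='./preprocessed_data/dataset.pkl')

with open('./preprocessed_data/dataset.pkl', 'rb') as f:
    data_whole = pickle.load(f)

X = data_whole['dataset_processed']
X_ctx = data_whole['context_processed']
y = data_whole['codes_preprocessed']
train_idx = data_whole['train_indices']
print(X.shape, X_ctx.shape, y.shape, len(train_idx))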