import preprocess  # project-local preprocessing helpers used below


def load_and_process(train_data_file,
                     test_data_file=None,
                     train_tokens_file=None,
                     test_tokens_file=None,
                     embed_size=300,
                     max_comment_size=250,
                     label_names=None,
                     fraction_dev=0.3,
                     debug=False):
    # Load pretrained GloVe embeddings as (emb_matrix, word2id, id2word)
    emb_data = preprocess.get_glove(embed_size)

    # Load and (optionally) subset train data
    train_data = preprocess.load_data(train_data_file, debug=debug)

    # Load test data
    if test_data_file:
        test_data = preprocess.load_data(test_data_file, debug=debug)
        id_test = test_data['id']

    # Tokenize train comments or load pre-tokenized train comments
    if debug or (train_tokens_file is None):
        tokens = preprocess.tokenize_df(train_data)
    else:
        tokens = preprocess.load_tokenized_comments(train_tokens_file)
    # Pad and create masks for train comments
    tokens, masks = preprocess.pad_comments(tokens, max_comment_size)

    # Tokenize test comments or load pre-tokenized test comments
    if test_data_file:
        if test_tokens_file is None:
            tokens_test = preprocess.tokenize_df(test_data)
        else:
            tokens_test = preprocess.load_tokenized_comments(test_tokens_file)
        # Pad and create masks for test comments
        tokens_test, masks_test = preprocess.pad_comments(
            tokens_test, max_comment_size)

    # Load train labels
    if label_names is None:
        label_names = [
            'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
            'identity_hate'
        ]
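    # (the defaults above are the six Jigsaw toxic-comment label columns)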
    labels = preprocess.filter_labels(train_data, label_names)

    # Split to train and dev sets
    train_dev_set = preprocess.split_train_dev(tokens,
                                               labels,
                                               masks,
                                               fraction_dev=fraction_dev)
    if test_data_file:
        test_set = (id_test, tokens_test, masks_test)
    else:
        test_set = None

    return emb_data, train_dev_set, test_set
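
A minimal usage sketch, assuming CSVs shaped like the toxic-comment data
implied by the default label names; the file paths are hypothetical:

# Hypothetical paths; the returns follow the order above: embedding data,
# the train/dev split, and the (id, tokens, masks) test triple (or None).
emb_data, train_dev_set, test_set = load_and_process(
    'data/train.csv',
    test_data_file='data/test.csv',
    embed_size=300,
    fraction_dev=0.3)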
Example #2
def __init__(self, config=None, emb_data=None, glove_dim=None):
    # Load word embedding data from memory if already loaded
    if emb_data is not None:
        self.emb_matrix = emb_data[0].astype('float32')
        self.word2id = emb_data[1]
        self.id2word = emb_data[2]
    # Load GloVe data from file
    elif glove_dim is not None:
        self.emb_matrix, self.word2id, self.id2word = get_glove(glove_dim)
        self.emb_matrix = self.emb_matrix.astype('float32')
    # Load config and build
    self.config = Config(config)
    self.build()
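
This constructor indexes emb_data positionally; a toy sketch of the
expected (emb_matrix, word2id, id2word) triple, assuming numpy and a
made-up vocabulary (the real triple comes from get_glove, which, like
Config, is assumed to be imported in the surrounding module):

import numpy as np

vocab = ['<pad>', 'the', 'cat']               # toy vocabulary
emb_matrix = np.random.rand(len(vocab), 300)  # rows align with word ids
word2id = {w: i for i, w in enumerate(vocab)}
id2word = {i: w for i, w in enumerate(vocab)}
emb_data = (emb_matrix, word2id, id2word)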
Example #3
import os

import preprocess
import rnn_model


def load_model():
    save_prefix = os.path.join(out_dir, config['exp_name'], config['exp_name'])
    emb_data = preprocess.get_glove(embed_size)
    model = rnn_model.PredictWithRNNModel(config, emb_data, save_prefix)
    return model
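
load_model reads out_dir, config, and embed_size from module scope; a
minimal sketch of those globals, with illustrative values:

out_dir = 'output'                      # hypothetical checkpoint directory
config = {'exp_name': 'rnn_baseline'}   # needs at least an 'exp_name' key
embed_size = 300                        # GloVe dimensionality to load

model = load_model()  # builds a PredictWithRNNModel pointed at save_prefix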