import pickle

import preprocess


def load_and_process(train_data_file,
                     test_data_file=None,
                     train_tokens_file=None,
                     test_tokens_file=None,
                     embed_size=300,
                     max_comment_size=250,
                     label_names=None,
                     fraction_dev=0.3,
                     debug=False):
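    """Load word embeddings and the train/test comment data, tokenize and
    pad the comments, and split the training data into train and dev sets.

    Returns a tuple (emb_data, train_dev_set, test_set); test_set is None
    when test_data_file is not given.
    """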
    # Get GloVe/word2vec embedding data
    emb_data = preprocess.get_glove(embed_size)

    # Load and (optionally) subset train data
    train_data = preprocess.load_data(train_data_file, debug=debug)

    # Load test data
    if test_data_file:
        test_data = preprocess.load_data(test_data_file, debug=debug)
        id_test = test_data['id']

    # Tokenize train comments or load pre-tokenized train comments
    if debug or (train_tokens_file is None):
        tokens = preprocess.tokenize_df(train_data)
    else:
        tokens = preprocess.load_tokenized_comments(train_tokens_file)
    # Pad and create masks for train comments
    tokens, masks = preprocess.pad_comments(tokens, max_comment_size)

    # Tokenize test comments or load pre-tokenized test comments
    if test_data_file:
        if test_tokens_file is None:
            tokens_test = preprocess.tokenize_df(test_data)
        else:
            tokens_test = preprocess.load_tokenized_comments(test_tokens_file)
        # Pad and create masks for test comments
        tokens_test, masks_test = preprocess.pad_comments(
            tokens_test, max_comment_size)

    # Load train labels
    if label_names is None:
        label_names = [
            'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
            'identity_hate'
        ]
    labels = preprocess.filter_labels(train_data, label_names)

    # Split into train and dev sets
    train_dev_set = preprocess.split_train_dev(tokens,
                                               labels,
                                               masks,
                                               fraction_dev=fraction_dev)
    if test_data_file:
        test_set = (id_test, tokens_test, masks_test)
    else:
        test_set = None

    return emb_data, train_dev_set, test_set
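

# A minimal usage sketch of load_and_process (the paths match those used
# below but are assumptions about where the Kaggle data lives; debug=True
# keeps the run small by subsetting the data and re-tokenizing on the fly):
#
#     emb_data, train_dev_set, test_set = load_and_process(
#         '../data/train.csv',
#         test_data_file='../data/test.csv',
#         fraction_dev=0.3,
#         debug=True)
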
# Paths to the raw CSV data
train_data_file = '../data/train.csv'
test_data_file = '../data/test.csv'

# Paths for the pickled, tokenized comments
train_tokenized_file = '../data/train_comments.p'
test_tokenized_file = '../data/test_comments.p'

# Load raw train and test data
train_data = preprocess.load_data(train_data_file)
test_data = preprocess.load_data(test_data_file)

# Number of training examples
n_train = len(train_data)

# Tokenize the training data
train_tokenized = preprocess.tokenize_df(train_data)
print("Tokenized {} comments from train data.".format(n_train))

# Write the tokenized training data to a pickle file
with open(train_tokenized_file, 'wb') as f:
    pickle.dump(train_tokenized, f)
    print("Saved {} tokenized comments from train data to {}.".format(
        n_train, train_tokenized_file))

# Repeat for the test data
n_test = len(test_data)
test_tokenized = preprocess.tokenize_df(test_data)
print("Tokenized {} comments from test data.".format(n_test))
with open(test_tokenized_file, 'wb') as f:
    pickle.dump(test_tokenized, f)
    print("Saved {} tokenized comments from test data to {}.".format(
        n_test, test_tokenized_file))