import preprocess


def load_and_process(train_data_file, test_data_file=None,
                     train_tokens_file=None, test_tokens_file=None,
                     embed_size=300, max_comment_size=250,
                     label_names=None, fraction_dev=0.3, debug=False):
    # Get GloVe/word2vec embedding data
    emb_data = preprocess.get_glove(embed_size)

    # Load and (optionally) subset train data
    train_data = preprocess.load_data(train_data_file, debug=debug)

    # Load test data
    if test_data_file:
        test_data = preprocess.load_data(test_data_file, debug=debug)
        id_test = test_data['id']

    # Tokenize train comments or load pre-tokenized train comments
    if debug or (train_tokens_file is None):
        tokens = preprocess.tokenize_df(train_data)
    else:
        tokens = preprocess.load_tokenized_comments(train_tokens_file)

    # Pad and create masks for train comments
    tokens, masks = preprocess.pad_comments(tokens, max_comment_size)

    # Tokenize test comments or load pre-tokenized test comments
    if test_data_file:
        if test_tokens_file is None:
            tokens_test = preprocess.tokenize_df(test_data)
        else:
            tokens_test = preprocess.load_tokenized_comments(test_tokens_file)
        # Pad and create masks for test comments
        tokens_test, masks_test = preprocess.pad_comments(
            tokens_test, max_comment_size)

    # Load train labels
    if label_names is None:
        label_names = ['toxic', 'severe_toxic', 'obscene',
                       'threat', 'insult', 'identity_hate']
    labels = preprocess.filter_labels(train_data, label_names)

    # Split into train and dev sets
    train_dev_set = preprocess.split_train_dev(tokens, labels, masks,
                                               fraction_dev=fraction_dev)

    if test_data_file:
        test_set = (id_test, tokens_test, masks_test)
    else:
        test_set = None

    return emb_data, train_dev_set, test_set
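# Example usage (a minimal sketch): the paths below are assumptions that mirror
# the tokenization script in this section, and they presume the pickled token
# files already exist -- otherwise leave train_tokens_file/test_tokens_file as
# None and the comments are tokenized from scratch. load_and_process returns
# the embedding data, the train/dev split, and (when a test file is given) the
# test set.
if __name__ == '__main__':
    emb_data, train_dev_set, test_set = load_and_process(
        '../data/train.csv',
        test_data_file='../data/test.csv',
        train_tokens_file='../data/train_comments.p',
        test_tokens_file='../data/test_comments.p',
        embed_size=300,
        max_comment_size=250,
        fraction_dev=0.3)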
import pickle

import preprocess

# Paths of the data
train_data_file = '../data/train.csv'
test_data_file = '../data/test.csv'
train_tokenized_file = '../data/train_comments.p'
test_tokenized_file = '../data/test_comments.p'

# Load data
train_data = preprocess.load_data(train_data_file)
test_data = preprocess.load_data(test_data_file)

# Size of the training data
n_train = len(train_data)

# Tokenize the training data
train_tokenized = preprocess.tokenize_df(train_data)
print "Tokenized {} comments from train data.".format(n_train)

# Write the tokenized training data to a .p (pickle) file
with open(train_tokenized_file, 'wb') as f:
    pickle.dump(train_tokenized, f)
print "Saved {} tokenized comments from train data to {}.".format(
    n_train, train_tokenized_file)

# Same for the test data
n_test = len(test_data)
test_tokenized = preprocess.tokenize_df(test_data)
print "Tokenized {} comments from test data.".format(n_test)
with open(test_tokenized_file, 'wb') as f:
    pickle.dump(test_tokenized, f)
print "Saved {} tokenized comments from test data to {}.".format(
    n_test, test_tokenized_file)
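# Sanity check (a minimal sketch, continuing the script above): reload the
# pickled token list with pickle.load, the standard-library inverse of the
# pickle.dump call used above, and confirm the count matches what was saved.
# These are the same files that load_and_process reads back through
# preprocess.load_tokenized_comments when train_tokens_file/test_tokens_file
# are passed.
with open(train_tokenized_file, 'rb') as f:
    reloaded = pickle.load(f)
assert len(reloaded) == n_train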