def generate_train_data(train_data_file, word_vectors_file, N=300000, N_dev=8000):
    """Build a shuffled train/dev split of gold mention-pair training instances.

    Args:
        train_data_file: path to a jsonlines file of document records.
        word_vectors_file: path to the word-vector file consumed by EmbedMap.
        N: number of pairs to place in the training split.
        N_dev: number of pairs to place in the dev split.

    Returns:
        dict with keys 'train_X', 'train_y', 'dev_X', 'dev_y'; the X entries
        are lists of per-channel slices, the y entries are label slices.

    Raises:
        ValueError: if fewer than N + N_dev training pairs are available.
    """
    # load word vectors and PoS one-hot vectors.
    embed_map = EmbedMap(word_vectors_file)
    # load training data file.
    print('loading training data from %s' % train_data_file)
    # preprocess the training data and generate training instances.
    # NOTE(review): the original left the jsonlines reader open (leaked file
    # handle); the context manager guarantees it is closed.
    train_data = []
    with jsonlines.open(train_data_file) as train_reader:
        for doc_data in train_reader.iter():
            doc = Document(doc_data, embed_map)
            train_data += doc.generate_gold_mention_pairs()
    print("---> total number of training pairs: %s" % len(train_data))
    # shuffle the training data and convert to separate numpy arrays.
    print('---> shuffle and devide into %s train-pairs and %s dev-pairs.' % (N, N_dev))
    shuffle(train_data)
    # Validate with an explicit raise rather than `assert`: asserts are
    # silently stripped under `python -O`, which would let an undersized
    # dataset slip through to the slicing below.
    if (N + N_dev) >= len(train_data):
        raise ValueError('not enough training data to have train/dev split: %s/%s' % (N, N_dev))
    data_X, data_y = convert_train_data(train_data)
    # data_X is a list of per-channel arrays; slice each channel the same way.
    train_X, train_y = [cn[:N] for cn in data_X], data_y[:N]
    dev_X, dev_y = [cn[N:N + N_dev] for cn in data_X], data_y[N:N + N_dev]
    return {'train_X': train_X, 'train_y': train_y, 'dev_X': dev_X, 'dev_y': dev_y}